# howard.objects.variants
import csv
import gc
import gzip
import io
import multiprocessing
import os
import random
import re
import shlex
import sqlite3
import subprocess
from tempfile import NamedTemporaryFile, TemporaryDirectory
import tempfile
import duckdb
import json
import yaml
import argparse
import Bio.bgzf as bgzf
import pandas as pd
from pyfaidx import Fasta
import numpy as np
import vcf
import logging as log
import fastparquet as fp
from multiprocesspandas import applyparallel

from howard.functions.commons import *
from howard.objects.database import *
from howard.functions.databases import *
from howard.functions.utils import *


class Variants:
    """
    Object wrapping a variant set (VCF/TSV/CSV/PSV/Parquet/duckdb) behind a
    DuckDB or SQLite connection, with its VCF header, configuration and
    parameters.
    """

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = None,
        param: dict = None,
        load: bool = False,
    ) -> None:
        """
        Initialize the object: variables, input, config, param, output,
        connexion and header, optionally loading the data.

        :param conn: existing database connection (a new one is created when None)
        :param input: input file path (or file-like object with a ``name`` attribute)
        :param output: output file path (or file-like object with a ``name`` attribute)
        :param config: configuration dictionary (defaults to an empty dict)
        :param param: parameters dictionary (defaults to an empty dict)
        :param load: load the input data immediately when True
        """

        # NOTE: `config`/`param` previously used mutable default arguments
        # (`= {}`), which are shared across calls; a None sentinel is used
        # instead, with identical behavior for callers.
        if config is None:
            config = {}
        if param is None:
            param = {}

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # Connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Load data
        if load:
            self.load_data()

    def set_input(self, input: str = None) -> None:
        """
        Set the input file and derive its name, extension and format.

        Accepts either a path string or a file-like object exposing ``name``.

        :param input: input file path or file-like object
        :type input: str
        :raises ValueError: when a non-string input has no ``name`` attribute
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            # Narrowed from a bare `except:`; only a missing `.name`
            # attribute means the input object is in a bad format.
            except AttributeError:
                # Fixed message: closing quote was missing in the original.
                log.error(f"Input file '{input}' in bad format")
                raise ValueError(f"Input file '{input}' in bad format")
        else:
            self.input = input

        # Input format, derived from the file extension
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        Assign the configuration dictionary of the object.

        :param config: configuration settings
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        Assign the parameters dictionary of the object.

        :param param: parameters settings
        :type param: dict
        """

        self.param = param
123 124 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 125 as the `param` attribute of the class instance 126 :type param: dict 127 """ 128 129 self.param = param 130 131 def init_variables(self) -> None: 132 """ 133 This function initializes the variables that will be used in the rest of the class 134 """ 135 136 self.prefix = "howard" 137 self.table_variants = "variants" 138 self.dataframe = None 139 140 self.comparison_map = { 141 "gt": ">", 142 "gte": ">=", 143 "lt": "<", 144 "lte": "<=", 145 "equals": "=", 146 "contains": "SIMILAR TO", 147 } 148 149 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 150 151 self.code_type_map_to_sql = { 152 "Integer": "INTEGER", 153 "String": "VARCHAR", 154 "Float": "FLOAT", 155 "Flag": "VARCHAR", 156 } 157 158 self.index_additionnal_fields = [] 159 160 def get_indexing(self) -> bool: 161 """ 162 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 163 returns False. 164 :return: The value of the indexing parameter. 165 """ 166 167 return self.get_param().get("indexing", False) 168 169 def get_connexion_config(self) -> dict: 170 """ 171 The function `get_connexion_config` returns a dictionary containing the configuration for a 172 connection, including the number of threads and memory limit. 173 :return: a dictionary containing the configuration for the Connexion library. 
174 """ 175 176 # config 177 config = self.get_config() 178 179 # Connexion config 180 connexion_config = {} 181 threads = self.get_threads() 182 183 # Threads 184 if threads: 185 connexion_config["threads"] = threads 186 187 # Memory 188 # if config.get("memory", None): 189 # connexion_config["memory_limit"] = config.get("memory") 190 if self.get_memory(): 191 connexion_config["memory_limit"] = self.get_memory() 192 193 # Temporary directory 194 if config.get("tmp", None): 195 connexion_config["temp_directory"] = config.get("tmp") 196 197 # Access 198 if config.get("access", None): 199 access = config.get("access") 200 if access in ["RO"]: 201 access = "READ_ONLY" 202 elif access in ["RW"]: 203 access = "READ_WRITE" 204 connexion_db = self.get_connexion_db() 205 if connexion_db in ":memory:": 206 access = "READ_WRITE" 207 connexion_config["access_mode"] = access 208 209 return connexion_config 210 211 def get_duckdb_settings(self) -> dict: 212 """ 213 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 214 string. 215 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 216 """ 217 218 # config 219 config = self.get_config() 220 221 # duckdb settings 222 duckdb_settings_dict = {} 223 if config.get("duckdb_settings", None): 224 duckdb_settings = config.get("duckdb_settings") 225 duckdb_settings = full_path(duckdb_settings) 226 # duckdb setting is a file 227 if os.path.exists(duckdb_settings): 228 with open(duckdb_settings) as json_file: 229 duckdb_settings_dict = yaml.safe_load(json_file) 230 # duckdb settings is a string 231 else: 232 duckdb_settings_dict = json.loads(duckdb_settings) 233 234 return duckdb_settings_dict 235 236 def set_connexion_db(self) -> str: 237 """ 238 The function `set_connexion_db` returns the appropriate database connection string based on the 239 input format and connection type. 240 :return: the value of the variable `connexion_db`. 
241 """ 242 243 # Default connexion db 244 default_connexion_db = ":memory:" 245 246 # Find connexion db 247 if self.get_input_format() in ["db", "duckdb"]: 248 connexion_db = self.get_input() 249 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 250 connexion_db = default_connexion_db 251 elif self.get_connexion_type() in ["tmpfile"]: 252 tmp_name = tempfile.mkdtemp( 253 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 254 ) 255 connexion_db = f"{tmp_name}/tmp.db" 256 elif self.get_connexion_type() != "": 257 connexion_db = self.get_connexion_type() 258 else: 259 connexion_db = default_connexion_db 260 261 # Set connexion db 262 self.connexion_db = connexion_db 263 264 return connexion_db 265 266 def set_connexion(self, conn) -> None: 267 """ 268 The function `set_connexion` creates a connection to a database, with options for different 269 database formats and settings. 270 271 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 272 database. If a connection is not provided, a new connection to an in-memory database is created. 
273 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 274 sqlite 275 """ 276 277 # Connexion db 278 connexion_db = self.set_connexion_db() 279 280 # Connexion config 281 connexion_config = self.get_connexion_config() 282 283 # Connexion format 284 connexion_format = self.get_config().get("connexion_format", "duckdb") 285 # Set connexion format 286 self.connexion_format = connexion_format 287 288 # Connexion 289 if not conn: 290 if connexion_format in ["duckdb"]: 291 conn = duckdb.connect(connexion_db, config=connexion_config) 292 # duckDB settings 293 duckdb_settings = self.get_duckdb_settings() 294 if duckdb_settings: 295 for setting in duckdb_settings: 296 setting_value = duckdb_settings.get(setting) 297 if isinstance(setting_value, str): 298 setting_value = f"'{setting_value}'" 299 conn.execute(f"PRAGMA {setting}={setting_value};") 300 elif connexion_format in ["sqlite"]: 301 conn = sqlite3.connect(connexion_db) 302 303 # Set connexion 304 self.conn = conn 305 306 # Log 307 log.debug(f"connexion_format: {connexion_format}") 308 log.debug(f"connexion_db: {connexion_db}") 309 log.debug(f"connexion config: {connexion_config}") 310 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 311 312 def set_output(self, output: str = None) -> None: 313 """ 314 The `set_output` function in Python sets the output file based on the input or a specified key 315 in the config file, extracting the output name, extension, and format. 316 317 :param output: The `output` parameter in the `set_output` method is used to specify the name of 318 the output file. If the config file has an 'output' key, the method sets the output to the value 319 of that key. 
If no output is provided, it sets the output to `None` 320 :type output: str 321 """ 322 323 if output and not isinstance(output, str): 324 self.output = output.name 325 else: 326 self.output = output 327 328 # Output format 329 if self.output: 330 output_name, output_extension = os.path.splitext(self.output) 331 self.output_name = output_name 332 self.output_extension = output_extension 333 self.output_format = self.output_extension.replace(".", "") 334 else: 335 self.output_name = None 336 self.output_extension = None 337 self.output_format = None 338 339 def set_header(self) -> None: 340 """ 341 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 342 """ 343 344 input_file = self.get_input() 345 default_header_list = [ 346 "##fileformat=VCFv4.2", 347 "#CHROM POS ID REF ALT QUAL FILTER INFO", 348 ] 349 350 # Full path 351 input_file = full_path(input_file) 352 353 if input_file: 354 355 input_format = self.get_input_format() 356 input_compressed = self.get_input_compressed() 357 config = self.get_config() 358 header_list = default_header_list 359 if input_format in [ 360 "vcf", 361 "hdr", 362 "tsv", 363 "csv", 364 "psv", 365 "parquet", 366 "db", 367 "duckdb", 368 ]: 369 # header provided in param 370 if config.get("header_file", None): 371 with open(config.get("header_file"), "rt") as f: 372 header_list = self.read_vcf_header(f) 373 # within a vcf file format (header within input file itsself) 374 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 375 # within a compressed vcf file format (.vcf.gz) 376 if input_compressed: 377 with bgzf.open(input_file, "rt") as f: 378 header_list = self.read_vcf_header(f) 379 # within an uncompressed vcf file format (.vcf) 380 else: 381 with open(input_file, "rt") as f: 382 header_list = self.read_vcf_header(f) 383 # header provided in default external file .hdr 384 elif os.path.exists((input_file + ".hdr")): 385 with open(input_file + ".hdr", "rt") as f: 386 header_list = 
self.read_vcf_header(f) 387 else: 388 try: # Try to get header info fields and file columns 389 390 with tempfile.TemporaryDirectory() as tmpdir: 391 392 # Create database 393 db_for_header = Database(database=input_file) 394 395 # Get header columns for infos fields 396 db_header_from_columns = ( 397 db_for_header.get_header_from_columns() 398 ) 399 400 # Get real columns in the file 401 db_header_columns = db_for_header.get_columns() 402 403 # Write header file 404 header_file_tmp = os.path.join(tmpdir, "header") 405 f = open(header_file_tmp, "w") 406 vcf.Writer(f, db_header_from_columns) 407 f.close() 408 409 # Replace #CHROM line with rel columns 410 header_list = db_for_header.read_header_file( 411 header_file=header_file_tmp 412 ) 413 header_list[-1] = "\t".join(db_header_columns) 414 415 except: 416 417 log.warning( 418 f"No header for file {input_file}. Set as default VCF header" 419 ) 420 header_list = default_header_list 421 422 else: # try for unknown format ? 423 424 log.error(f"Input file format '{input_format}' not available") 425 raise ValueError(f"Input file format '{input_format}' not available") 426 427 if not header_list: 428 header_list = default_header_list 429 430 # header as list 431 self.header_list = header_list 432 433 # header as VCF object 434 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 435 436 else: 437 438 self.header_list = None 439 self.header_vcf = None 440 441 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 442 """ 443 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 444 DataFrame based on the connection format. 445 446 :param query: The `query` parameter in the `get_query_to_df` function is a string that 447 represents the SQL query you want to execute. 
This query will be used to fetch data from a 448 database and convert it into a pandas DataFrame 449 :type query: str 450 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 451 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 452 function will only fetch up to that number of rows from the database query result. If no limit 453 is specified, 454 :type limit: int 455 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 456 """ 457 458 # Connexion format 459 connexion_format = self.get_connexion_format() 460 461 # Limit in query 462 if limit: 463 pd.set_option("display.max_rows", limit) 464 if connexion_format in ["duckdb"]: 465 df = ( 466 self.conn.execute(query) 467 .fetch_record_batch(limit) 468 .read_next_batch() 469 .to_pandas() 470 ) 471 elif connexion_format in ["sqlite"]: 472 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 473 474 # Full query 475 else: 476 if connexion_format in ["duckdb"]: 477 df = self.conn.execute(query).df() 478 elif connexion_format in ["sqlite"]: 479 df = pd.read_sql_query(query, self.conn) 480 481 return df 482 483 def get_overview(self) -> None: 484 """ 485 The function prints the input, output, config, and dataframe of the current object 486 """ 487 table_variants_from = self.get_table_variants(clause="from") 488 sql_columns = self.get_header_columns_as_sql() 489 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 490 df = self.get_query_to_df(sql_query_export) 491 log.info( 492 "Input: " 493 + str(self.get_input()) 494 + " [" 495 + str(str(self.get_input_format())) 496 + "]" 497 ) 498 log.info( 499 "Output: " 500 + str(self.get_output()) 501 + " [" 502 + str(str(self.get_output_format())) 503 + "]" 504 ) 505 log.info("Config: ") 506 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 507 "\n" 508 ): 509 log.info("\t" + str(d)) 510 log.info("Param: ") 511 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 512 "\n" 513 ): 514 log.info("\t" + str(d)) 515 log.info("Sample list: " + str(self.get_header_sample_list())) 516 log.info("Dataframe: ") 517 for d in str(df).split("\n"): 518 log.info("\t" + str(d)) 519 520 # garbage collector 521 del df 522 gc.collect() 523 524 return None 525 526 def get_stats(self) -> dict: 527 """ 528 The `get_stats` function calculates and returns various statistics of the current object, 529 including information about the input file, variants, samples, header fields, quality, and 530 SNVs/InDels. 531 :return: a dictionary containing various statistics of the current object. The dictionary has 532 the following structure: 533 """ 534 535 # Log 536 log.info(f"Stats Calculation...") 537 538 # table varaints 539 table_variants_from = self.get_table_variants() 540 541 # stats dict 542 stats = {"Infos": {}} 543 544 ### File 545 input_file = self.get_input() 546 stats["Infos"]["Input file"] = input_file 547 548 # Header 549 header_infos = self.get_header().infos 550 header_formats = self.get_header().formats 551 header_infos_list = list(header_infos) 552 header_formats_list = list(header_formats) 553 554 ### Variants 555 556 stats["Variants"] = {} 557 558 # Variants by chr 559 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 560 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 561 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 562 by=["CHROM"], kind="quicksort" 563 ) 564 565 # Total number of variants 566 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 567 568 # Calculate percentage 569 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 570 lambda x: (x / nb_of_variants) 571 ) 572 573 stats["Variants"]["Number of variants by chromosome"] = ( 574 nb_of_variants_by_chrom.to_dict(orient="index") 575 ) 576 577 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 578 579 ### Samples 580 581 # Init 582 samples = {} 583 nb_of_samples = 0 584 585 # Check Samples 586 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 587 log.debug(f"Check samples...") 588 for sample in self.get_header_sample_list(): 589 sql_query_samples = f""" 590 SELECT '{sample}' as sample, 591 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 592 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 593 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 594 FROM {table_variants_from} 595 WHERE ( 596 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 597 AND 598 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 599 ) 600 GROUP BY genotype 601 """ 602 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 603 sample_genotype_count = sql_query_genotype_df["count"].sum() 604 if len(sql_query_genotype_df): 605 nb_of_samples += 1 606 samples[f"{sample} - {sample_genotype_count} variants"] = ( 607 sql_query_genotype_df.to_dict(orient="index") 608 ) 609 610 stats["Samples"] = samples 611 stats["Infos"]["Number of samples"] = nb_of_samples 612 613 # # 614 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 615 # stats["Infos"]["Number of samples"] = nb_of_samples 616 # elif nb_of_samples: 617 # stats["Infos"]["Number of samples"] = "not a VCF format" 618 619 ### INFO and FORMAT fields 620 header_types_df = {} 621 header_types_list = { 622 "List of INFO fields": header_infos, 623 "List of FORMAT fields": header_formats, 624 } 625 i = 0 626 for header_type in header_types_list: 627 628 header_type_infos = header_types_list.get(header_type) 629 header_infos_dict = {} 630 631 for info in header_type_infos: 632 633 i += 1 634 header_infos_dict[i] = {} 635 636 # ID 637 header_infos_dict[i]["id"] = info 638 639 # num 640 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 641 if header_type_infos[info].num in genotype_map.keys(): 642 header_infos_dict[i]["Number"] = genotype_map.get( 643 header_type_infos[info].num 644 ) 645 else: 646 header_infos_dict[i]["Number"] = header_type_infos[info].num 647 648 # type 649 if header_type_infos[info].type: 650 header_infos_dict[i]["Type"] = header_type_infos[info].type 651 else: 652 header_infos_dict[i]["Type"] = "." 653 654 # desc 655 if header_type_infos[info].desc != None: 656 header_infos_dict[i]["Description"] = header_type_infos[info].desc 657 else: 658 header_infos_dict[i]["Description"] = "" 659 660 if len(header_infos_dict): 661 header_types_df[header_type] = pd.DataFrame.from_dict( 662 header_infos_dict, orient="index" 663 ).to_dict(orient="index") 664 665 # Stats 666 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 667 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 668 stats["Header"] = header_types_df 669 670 ### QUAL 671 if "QUAL" in self.get_header_columns(): 672 sql_query_qual = f""" 673 SELECT 674 avg(CAST(QUAL AS INTEGER)) AS Average, 675 min(CAST(QUAL AS INTEGER)) AS Minimum, 676 max(CAST(QUAL AS INTEGER)) AS Maximum, 677 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 678 median(CAST(QUAL AS INTEGER)) AS Median, 679 variance(CAST(QUAL AS INTEGER)) AS Variance 680 FROM {table_variants_from} 681 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 682 """ 683 684 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 685 stats["Quality"] = {"Stats": qual} 686 687 ### SNV and InDel 688 689 sql_query_snv = f""" 690 691 SELECT Type, count FROM ( 692 693 SELECT 694 'Total' AS Type, 695 count(*) AS count 696 FROM {table_variants_from} 697 698 UNION 699 700 SELECT 701 'MNV' AS Type, 702 count(*) AS count 703 FROM {table_variants_from} 704 WHERE len(REF) > 1 AND len(ALT) > 1 705 AND len(REF) = len(ALT) 706 707 UNION 708 709 SELECT 710 'InDel' AS Type, 711 count(*) AS count 712 FROM 
{table_variants_from} 713 WHERE len(REF) > 1 OR len(ALT) > 1 714 AND len(REF) != len(ALT) 715 716 UNION 717 718 SELECT 719 'SNV' AS Type, 720 count(*) AS count 721 FROM {table_variants_from} 722 WHERE len(REF) = 1 AND len(ALT) = 1 723 724 ) 725 726 ORDER BY count DESC 727 728 """ 729 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 730 731 sql_query_snv_substitution = f""" 732 SELECT 733 concat(REF, '>', ALT) AS 'Substitution', 734 count(*) AS count 735 FROM {table_variants_from} 736 WHERE len(REF) = 1 AND len(ALT) = 1 737 GROUP BY REF, ALT 738 ORDER BY count(*) DESC 739 """ 740 snv_substitution = ( 741 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 742 ) 743 stats["Variants"]["Counts"] = snv_indel 744 stats["Variants"]["Substitutions"] = snv_substitution 745 746 return stats 747 748 def stats_to_file(self, file: str = None) -> str: 749 """ 750 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 751 into a JSON object, and writes the JSON object to the specified file. 752 753 :param file: The `file` parameter is a string that represents the file path where the JSON data 754 will be written 755 :type file: str 756 :return: the name of the file that was written to. 757 """ 758 759 # Get stats 760 stats = self.get_stats() 761 762 # Serializing json 763 json_object = json.dumps(stats, indent=4) 764 765 # Writing to sample.json 766 with open(file, "w") as outfile: 767 outfile.write(json_object) 768 769 return file 770 771 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 772 """ 773 The `print_stats` function generates a markdown file and prints the statistics contained in a 774 JSON file in a formatted manner. 775 776 :param output_file: The `output_file` parameter is a string that specifies the path and filename 777 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 778 provided, a temporary directory will be created and the stats will be saved in a file named 779 "stats.md" within that 780 :type output_file: str 781 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 782 file where the statistics will be saved. If no value is provided, a temporary directory will be 783 created and a default file name "stats.json" will be used 784 :type json_file: str 785 :return: The function `print_stats` does not return any value. It has a return type annotation 786 of `None`. 787 """ 788 789 # Full path 790 output_file = full_path(output_file) 791 json_file = full_path(json_file) 792 793 with tempfile.TemporaryDirectory() as tmpdir: 794 795 # Files 796 if not output_file: 797 output_file = os.path.join(tmpdir, "stats.md") 798 if not json_file: 799 json_file = os.path.join(tmpdir, "stats.json") 800 801 # Create folders 802 if not os.path.exists(os.path.dirname(output_file)): 803 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 804 if not os.path.exists(os.path.dirname(json_file)): 805 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 806 807 # Create stats JSON file 808 stats_file = self.stats_to_file(file=json_file) 809 810 # Print stats file 811 with open(stats_file) as f: 812 stats = yaml.safe_load(f) 813 814 # Output 815 output_title = [] 816 output_index = [] 817 output = [] 818 819 # Title 820 output_title.append("# HOWARD Stats") 821 822 # Index 823 output_index.append("## Index") 824 825 # Process sections 826 for section in stats: 827 infos = stats.get(section) 828 section_link = "#" + section.lower().replace(" ", "-") 829 output.append(f"## {section}") 830 output_index.append(f"- [{section}]({section_link})") 831 832 if len(infos): 833 for info in infos: 834 try: 835 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 836 is_df = True 837 except: 838 try: 839 df = pd.DataFrame.from_dict( 840 
json.loads((infos.get(info))), orient="index" 841 ) 842 is_df = True 843 except: 844 is_df = False 845 if is_df: 846 output.append(f"### {info}") 847 info_link = "#" + info.lower().replace(" ", "-") 848 output_index.append(f" - [{info}]({info_link})") 849 output.append(f"{df.to_markdown(index=False)}") 850 else: 851 output.append(f"- {info}: {infos.get(info)}") 852 else: 853 output.append(f"NA") 854 855 # Write stats in markdown file 856 with open(output_file, "w") as fp: 857 for item in output_title: 858 fp.write("%s\n" % item) 859 for item in output_index: 860 fp.write("%s\n" % item) 861 for item in output: 862 fp.write("%s\n" % item) 863 864 # Output stats in markdown 865 print("") 866 print("\n\n".join(output_title)) 867 print("") 868 print("\n\n".join(output)) 869 print("") 870 871 return None 872 873 def get_input(self) -> str: 874 """ 875 It returns the value of the input variable. 876 :return: The input is being returned. 877 """ 878 return self.input 879 880 def get_input_format(self, input_file: str = None) -> str: 881 """ 882 This function returns the format of the input variable, either from the provided input file or 883 by prompting for input. 884 885 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 886 represents the file path of the input file. If no `input_file` is provided when calling the 887 method, it will default to `None` 888 :type input_file: str 889 :return: The format of the input variable is being returned. 890 """ 891 892 if not input_file: 893 input_file = self.get_input() 894 input_format = get_file_format(input_file) 895 return input_format 896 897 def get_input_compressed(self, input_file: str = None) -> str: 898 """ 899 The function `get_input_compressed` returns the format of the input variable after compressing 900 it. 901 902 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 903 that represents the file path of the input file. 
If no `input_file` is provided when calling the 904 method, it will default to `None` and the method will then call `self.get_input()` to 905 :type input_file: str 906 :return: The function `get_input_compressed` returns the compressed format of the input 907 variable. 908 """ 909 910 if not input_file: 911 input_file = self.get_input() 912 input_compressed = get_file_compressed(input_file) 913 return input_compressed 914 915 def get_output(self) -> str: 916 """ 917 It returns the output of the neuron. 918 :return: The output of the neural network. 919 """ 920 921 return self.output 922 923 def get_output_format(self, output_file: str = None) -> str: 924 """ 925 The function `get_output_format` returns the format of the input variable or the output file if 926 provided. 927 928 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 929 that represents the file path of the output file. If no `output_file` is provided when calling 930 the method, it will default to the output obtained from the `get_output` method of the class 931 instance. The 932 :type output_file: str 933 :return: The format of the input variable is being returned. 934 """ 935 936 if not output_file: 937 output_file = self.get_output() 938 output_format = get_file_format(output_file) 939 940 return output_format 941 942 def get_config(self) -> dict: 943 """ 944 It returns the config 945 :return: The config variable is being returned. 946 """ 947 return self.config 948 949 def get_param(self) -> dict: 950 """ 951 It returns the param 952 :return: The param variable is being returned. 953 """ 954 return self.param 955 956 def get_connexion_db(self) -> str: 957 """ 958 It returns the connexion_db attribute of the object 959 :return: The connexion_db is being returned. 960 """ 961 return self.connexion_db 962 963 def get_prefix(self) -> str: 964 """ 965 It returns the prefix of the object. 966 :return: The prefix is being returned. 
967 """ 968 return self.prefix 969 970 def get_table_variants(self, clause: str = "select") -> str: 971 """ 972 This function returns the table_variants attribute of the object 973 974 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 975 defaults to select (optional) 976 :return: The table_variants attribute of the object. 977 """ 978 979 # Access 980 access = self.get_config().get("access", None) 981 982 # Clauses "select", "where", "update" 983 if clause in ["select", "where", "update"]: 984 table_variants = self.table_variants 985 # Clause "from" 986 elif clause in ["from"]: 987 # For Read Only 988 if self.get_input_format() in ["parquet"] and access in ["RO"]: 989 input_file = self.get_input() 990 table_variants = f"'{input_file}' as variants" 991 # For Read Write 992 else: 993 table_variants = f"{self.table_variants} as variants" 994 else: 995 table_variants = self.table_variants 996 return table_variants 997 998 def get_tmp_dir(self) -> str: 999 """ 1000 The function `get_tmp_dir` returns the temporary directory path based on configuration 1001 parameters or a default path. 1002 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1003 configuration, parameters, and a default value of "/tmp". 1004 """ 1005 1006 return get_tmp( 1007 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1008 ) 1009 1010 def get_connexion_type(self) -> str: 1011 """ 1012 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1013 1014 :return: The connexion type is being returned. 1015 """ 1016 return self.get_config().get("connexion_type", "memory") 1017 1018 def get_connexion(self): 1019 """ 1020 It returns the connection object 1021 1022 :return: The connection object. 1023 """ 1024 return self.conn 1025 1026 def close_connexion(self) -> None: 1027 """ 1028 This function closes the connection to the database. 
1029 :return: The connection is being closed. 1030 """ 1031 return self.conn.close() 1032 1033 def get_header(self, type: str = "vcf"): 1034 """ 1035 This function returns the header of the VCF file as a list of strings 1036 1037 :param type: the type of header you want to get, defaults to vcf (optional) 1038 :return: The header of the vcf file. 1039 """ 1040 1041 if self.header_vcf: 1042 if type == "vcf": 1043 return self.header_vcf 1044 elif type == "list": 1045 return self.header_list 1046 else: 1047 if type == "vcf": 1048 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1049 return header 1050 elif type == "list": 1051 return vcf_required 1052 1053 def get_header_length(self, file: str = None) -> int: 1054 """ 1055 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1056 line. 1057 1058 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1059 header file. If this argument is provided, the function will read the header from the specified 1060 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1061 :type file: str 1062 :return: the length of the header list, excluding the #CHROM line. 1063 """ 1064 1065 if file: 1066 return len(self.read_vcf_header_file(file=file)) - 1 1067 elif self.get_header(type="list"): 1068 return len(self.get_header(type="list")) - 1 1069 else: 1070 return 0 1071 1072 def get_header_columns(self) -> str: 1073 """ 1074 This function returns the header list of a VCF 1075 1076 :return: The length of the header list. 1077 """ 1078 if self.get_header(): 1079 return self.get_header(type="list")[-1] 1080 else: 1081 return "" 1082 1083 def get_header_columns_as_list(self) -> list: 1084 """ 1085 This function returns the header list of a VCF 1086 1087 :return: The length of the header list. 
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        Return the header column names as a comma-separated list of
        double-quoted SQL identifiers.

        :return: the column names formatted for use in a SQL statement
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    def get_header_sample_list(self) -> list:
        """
        Return the list of sample names declared in the VCF header.

        :return: the sample names as a list of strings
        """
        return self.header_vcf.samples

    def get_verbose(self) -> bool:
        """
        Return the "verbose" flag from the configuration.

        :return: the value of the "verbose" configuration key, or False if
            the key doesn't exist
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        Return the connexion format of the object; must be "duckdb" or
        "sqlite".

        :raises ValueError: if the connexion format is not supported
        :return: the connexion format
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current database connexion.

        :param file: path or file-like object of the file to load
        :param columns: comma-separated column names used for the INSERT
        :type columns: str
        :param header_len: number of leading lines to skip before the data,
            defaults to 0
        :type header_len: int (optional)
        :param sep: field delimiter used in the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config may override the chunk size
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves "chunk" through its pandas replacement
                    # scan on the local DataFrame variable
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
        table before loading the data and specify a sample size.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file.
If it is set to `None`, the default value of 20480 will be used, defaults to 1208 20480 1209 :type sample_size: int (optional) 1210 """ 1211 1212 log.info("Loading...") 1213 1214 # change input file 1215 if input_file: 1216 self.set_input(input_file) 1217 self.set_header() 1218 1219 # drop variants table 1220 if drop_variants_table: 1221 self.drop_variants_table() 1222 1223 # get table variants 1224 table_variants = self.get_table_variants() 1225 1226 # Access 1227 access = self.get_config().get("access", None) 1228 log.debug(f"access: {access}") 1229 1230 # Input format and compress 1231 input_format = self.get_input_format() 1232 input_compressed = self.get_input_compressed() 1233 log.debug(f"input_format: {input_format}") 1234 log.debug(f"input_compressed: {input_compressed}") 1235 1236 # input_compressed_format 1237 if input_compressed: 1238 input_compressed_format = "gzip" 1239 else: 1240 input_compressed_format = "none" 1241 log.debug(f"input_compressed_format: {input_compressed_format}") 1242 1243 # Connexion format 1244 connexion_format = self.get_connexion_format() 1245 1246 # Sample size 1247 if not sample_size: 1248 sample_size = -1 1249 log.debug(f"sample_size: {sample_size}") 1250 1251 # Load data 1252 log.debug(f"Load Data from {input_format}") 1253 1254 # DuckDB connexion 1255 if connexion_format in ["duckdb"]: 1256 1257 # Database already exists 1258 if self.input_format in ["db", "duckdb"]: 1259 1260 if connexion_format in ["duckdb"]: 1261 log.debug(f"Input file format '{self.input_format}' duckDB") 1262 else: 1263 log.error( 1264 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1265 ) 1266 raise ValueError( 1267 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1268 ) 1269 1270 # Load from existing database format 1271 else: 1272 1273 try: 1274 # Create Table or View 1275 database = Database(database=self.input) 1276 sql_from = 
database.get_sql_from(sample_size=sample_size) 1277 1278 if access in ["RO"]: 1279 sql_load = ( 1280 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1281 ) 1282 else: 1283 sql_load = ( 1284 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1285 ) 1286 self.conn.execute(sql_load) 1287 1288 except: 1289 # Format not available 1290 log.error(f"Input file format '{self.input_format}' not available") 1291 raise ValueError( 1292 f"Input file format '{self.input_format}' not available" 1293 ) 1294 1295 # SQLite connexion 1296 elif connexion_format in ["sqlite"] and input_format in [ 1297 "vcf", 1298 "tsv", 1299 "csv", 1300 "psv", 1301 ]: 1302 1303 # Main structure 1304 structure = { 1305 "#CHROM": "VARCHAR", 1306 "POS": "INTEGER", 1307 "ID": "VARCHAR", 1308 "REF": "VARCHAR", 1309 "ALT": "VARCHAR", 1310 "QUAL": "VARCHAR", 1311 "FILTER": "VARCHAR", 1312 "INFO": "VARCHAR", 1313 } 1314 1315 # Strcuture with samples 1316 structure_complete = structure 1317 if self.get_header_sample_list(): 1318 structure["FORMAT"] = "VARCHAR" 1319 for sample in self.get_header_sample_list(): 1320 structure_complete[sample] = "VARCHAR" 1321 1322 # Columns list for create and insert 1323 sql_create_table_columns = [] 1324 sql_create_table_columns_list = [] 1325 for column in structure_complete: 1326 column_type = structure_complete[column] 1327 sql_create_table_columns.append( 1328 f'"{column}" {column_type} default NULL' 1329 ) 1330 sql_create_table_columns_list.append(f'"{column}"') 1331 1332 # Create database 1333 log.debug(f"Create Table {table_variants}") 1334 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1335 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1336 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1337 self.conn.execute(sql_create_table) 1338 1339 # chunksize define length of file chunk load file 1340 chunksize = 100000 1341 1342 # delimiter 1343 delimiter 
= file_format_delimiters.get(input_format, "\t") 1344 1345 # Load the input file 1346 with open(self.input, "rt") as input_file: 1347 1348 # Use the appropriate file handler based on the input format 1349 if input_compressed: 1350 input_file = bgzf.open(self.input, "rt") 1351 if input_format in ["vcf"]: 1352 header_len = self.get_header_length() 1353 else: 1354 header_len = 0 1355 1356 # Insert the file contents into a table 1357 self.insert_file_to_table( 1358 input_file, 1359 columns=sql_create_table_columns_list_sql, 1360 header_len=header_len, 1361 sep=delimiter, 1362 chunksize=chunksize, 1363 ) 1364 1365 else: 1366 log.error( 1367 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1368 ) 1369 raise ValueError( 1370 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1371 ) 1372 1373 # Explode INFOS fields into table fields 1374 if self.get_explode_infos(): 1375 self.explode_infos( 1376 prefix=self.get_explode_infos_prefix(), 1377 fields=self.get_explode_infos_fields(), 1378 force=True, 1379 ) 1380 1381 # Create index after insertion 1382 self.create_indexes() 1383 1384 def get_explode_infos(self) -> bool: 1385 """ 1386 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1387 to False if it is not set. 1388 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1389 value. If the parameter is not present, it will return False. 1390 """ 1391 1392 return self.get_param().get("explode", {}).get("explode_infos", False) 1393 1394 def get_explode_infos_fields( 1395 self, 1396 explode_infos_fields: str = None, 1397 remove_fields_not_in_header: bool = False, 1398 ) -> list: 1399 """ 1400 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1401 the input parameter `explode_infos_fields`. 
1402 1403 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1404 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1405 comma-separated list of field names to explode 1406 :type explode_infos_fields: str 1407 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1408 flag that determines whether to remove fields that are not present in the header. If it is set 1409 to `True`, any field that is not in the header will be excluded from the list of exploded 1410 information fields. If it is set to `, defaults to False 1411 :type remove_fields_not_in_header: bool (optional) 1412 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1413 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1414 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1415 Otherwise, it returns a list of exploded information fields after removing any spaces and 1416 splitting the string by commas. 
1417 """ 1418 1419 # If no fields, get it in param 1420 if not explode_infos_fields: 1421 explode_infos_fields = ( 1422 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1423 ) 1424 1425 # If no fields, defined as all fields in header using keyword 1426 if not explode_infos_fields: 1427 explode_infos_fields = "*" 1428 1429 # If fields list not empty 1430 if explode_infos_fields: 1431 1432 # Input fields list 1433 if isinstance(explode_infos_fields, str): 1434 fields_input = explode_infos_fields.split(",") 1435 elif isinstance(explode_infos_fields, list): 1436 fields_input = explode_infos_fields 1437 else: 1438 fields_input = [] 1439 1440 # Fields list without * keyword 1441 fields_without_all = fields_input.copy() 1442 if "*".casefold() in (item.casefold() for item in fields_without_all): 1443 fields_without_all.remove("*") 1444 1445 # Fields in header 1446 fields_in_header = sorted(list(set(self.get_header().infos))) 1447 1448 # Construct list of fields 1449 fields_output = [] 1450 for field in fields_input: 1451 1452 # Strip field 1453 field = field.strip() 1454 1455 # format keyword * in regex 1456 if field.upper() in ["*"]: 1457 field = ".*" 1458 1459 # Find all fields with pattern 1460 r = re.compile(field) 1461 fields_search = sorted(list(filter(r.match, fields_in_header))) 1462 1463 # Remove fields input from search 1464 if fields_search != [field]: 1465 fields_search = sorted( 1466 list(set(fields_search).difference(fields_input)) 1467 ) 1468 1469 # If field is not in header (avoid not well formatted header) 1470 if not fields_search and not remove_fields_not_in_header: 1471 fields_search = [field] 1472 1473 # Add found fields 1474 for new_field in fields_search: 1475 # Add field, if not already exists, and if it is in header (if asked) 1476 if ( 1477 new_field not in fields_output 1478 and ( 1479 not remove_fields_not_in_header 1480 or new_field in fields_in_header 1481 ) 1482 and new_field not in [".*"] 1483 ): 1484 
fields_output.append(new_field) 1485 1486 return fields_output 1487 1488 else: 1489 1490 return [] 1491 1492 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1493 """ 1494 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1495 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1496 not provided. 1497 1498 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1499 prefix to be used for exploding or expanding information 1500 :type explode_infos_prefix: str 1501 :return: the value of the variable `explode_infos_prefix`. 1502 """ 1503 1504 if not explode_infos_prefix: 1505 explode_infos_prefix = ( 1506 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1507 ) 1508 1509 return explode_infos_prefix 1510 1511 def add_column( 1512 self, 1513 table_name, 1514 column_name, 1515 column_type, 1516 default_value=None, 1517 drop: bool = False, 1518 ) -> dict: 1519 """ 1520 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1521 doesn't already exist. 1522 1523 :param table_name: The name of the table to which you want to add a column 1524 :param column_name: The parameter "column_name" is the name of the column that you want to add 1525 to the table 1526 :param column_type: The `column_type` parameter specifies the data type of the column that you 1527 want to add to the table. It should be a string that represents the desired data type, such as 1528 "INTEGER", "TEXT", "REAL", etc 1529 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1530 default value for the newly added column. 
If a default value is provided, it will be assigned to 1531 the column for any existing rows that do not have a value for that column 1532 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1533 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1534 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1535 to False 1536 :type drop: bool (optional) 1537 :return: a boolean value indicating whether the column was successfully added to the table. 1538 """ 1539 1540 # added 1541 added = False 1542 dropped = False 1543 1544 # Check if the column already exists in the table 1545 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1546 columns = self.get_query_to_df(query).columns.tolist() 1547 if column_name in columns: 1548 log.debug( 1549 f"The {column_name} column already exists in the {table_name} table" 1550 ) 1551 if drop: 1552 self.drop_column(table_name=table_name, column_name=column_name) 1553 dropped = True 1554 else: 1555 return None 1556 else: 1557 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1558 1559 # Add column in table 1560 add_column_query = ( 1561 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1562 ) 1563 if default_value is not None: 1564 add_column_query += f" DEFAULT {default_value}" 1565 self.execute_query(add_column_query) 1566 added = not dropped 1567 log.debug( 1568 f"The {column_name} column was successfully added to the {table_name} table" 1569 ) 1570 1571 if added: 1572 added_column = { 1573 "table_name": table_name, 1574 "column_name": column_name, 1575 "column_type": column_type, 1576 "default_value": default_value, 1577 } 1578 else: 1579 added_column = None 1580 1581 return added_column 1582 1583 def drop_column( 1584 self, column: dict = None, table_name: str = None, column_name: str = None 1585 ) -> bool: 1586 """ 1587 The `drop_column` function drops a 
specified column from a given table in a database and returns 1588 True if the column was successfully dropped, and False if the column does not exist in the 1589 table. 1590 1591 :param column: The `column` parameter is a dictionary that contains information about the column 1592 you want to drop. It has two keys: 1593 :type column: dict 1594 :param table_name: The `table_name` parameter is the name of the table from which you want to 1595 drop a column 1596 :type table_name: str 1597 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1598 from the table 1599 :type column_name: str 1600 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1601 and False if the column does not exist in the table. 1602 """ 1603 1604 # Find column infos 1605 if column: 1606 if isinstance(column, dict): 1607 table_name = column.get("table_name", None) 1608 column_name = column.get("column_name", None) 1609 elif isinstance(column, str): 1610 table_name = self.get_table_variants() 1611 column_name = column 1612 else: 1613 table_name = None 1614 column_name = None 1615 1616 if not table_name and not column_name: 1617 return False 1618 1619 # Removed 1620 removed = False 1621 1622 # Check if the column already exists in the table 1623 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1624 columns = self.get_query_to_df(query).columns.tolist() 1625 if column_name in columns: 1626 log.debug(f"The {column_name} column exists in the {table_name} table") 1627 else: 1628 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1629 return False 1630 1631 # Add column in table # ALTER TABLE integers DROP k 1632 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1633 self.execute_query(add_column_query) 1634 removed = True 1635 log.debug( 1636 f"The {column_name} column was successfully dropped to the {table_name} table" 1637 ) 1638 1639 return removed 1640 1641 def 
explode_infos( 1642 self, 1643 prefix: str = None, 1644 create_index: bool = False, 1645 fields: list = None, 1646 force: bool = False, 1647 proccess_all_fields_together: bool = False, 1648 ) -> list: 1649 """ 1650 The `explode_infos` function takes a VCF file and explodes the INFO fields into individual 1651 columns, returning a list of added columns. 1652 1653 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1654 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1655 `self.get_explode_infos_prefix()` as the prefix 1656 :type prefix: str 1657 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1658 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1659 `False`, indexes will not be created. The default value is `False`, defaults to False 1660 :type create_index: bool (optional) 1661 :param fields: The `fields` parameter is a list of INFO fields that you want to explode into 1662 individual columns. If this parameter is not provided, all INFO fields will be exploded 1663 :type fields: list 1664 :param force: The `force` parameter is a boolean flag that determines whether to drop and 1665 recreate the column if it already exists in the table. If `force` is set to `True`, the column 1666 will be dropped and recreated. If `force` is set to `False`, the column will not be dropped, 1667 defaults to False 1668 :type force: bool (optional) 1669 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1670 flag that determines whether to process all the INFO fields together or individually. If set to 1671 `True`, all the INFO fields will be processed together. 
        If set to `False`, each INFO field will
        be processed individually, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :return: The function `explode_infos` returns a list of added columns.
        """

        # drop indexes (they would block/slow the ALTER/UPDATE statements)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access mode ("RO" means read-only: nothing is exploded)
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix (fall back to the configured prefix, then to "INFO/")
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            table_variants = self.get_table_variants(clause="select")

            # extra infos (best-effort: not all inputs provide them)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check (header fields plus explicitly requested ones)
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. "*" or regexes)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/cardinality from the header; unknown fields default
                    # to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "info=value" from the
                        # raw INFO column, per backend dialect
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes (best-effort: fall back to a single pass)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed with more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the variants table after insertion: a composite
        index on (#CHROM, POS, REF, ALT), one index per coordinate column,
        and one per additional exploded field. Skipped in read-only mode or
        when indexing is disabled.
        """

        # Access mode ("RO" means read-only)
        access = self.get_config().get("access", None)

        # get table variants
        # NOTE(review): "FROM" (uppercase) does not match the "from" clause
        # branch of get_table_variants, so the bare table name is returned --
        # confirm this is intended
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

    def drop_indexes(self) -> None:
        """
        Drop all indexes of the variants table (no-op in read-only mode).
        """

        # Access mode ("RO" means read-only)
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            # List existing indexes, per backend dialect
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        It reads the header of a VCF file and returns a list of the header lines

        :param f: the file object
        :return: The header lines of the VCF file.
        """

        # Collect lines up to and including the #CHROM columns line
        header_list = []
        for line in f:
            header_list.append(line)
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        Read the header of a VCF file, handling both bgzip-compressed and
        uncompressed files.

        :param file: the path to the VCF header file to read
        :type file: str
        :return: the header lines as a list
        """

        # Pick the handler matching the file's compression
        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        Execute a SQL query on the current connexion.

        :param query: the query to be executed (None/empty is a no-op)
        :return: the cursor/result of the query, or None when no query was
            given
        """
        if query:
            return self.conn.execute(query)  # .fetchall()
        else:
            return None

    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
    ) -> bool:
        """
        The `export_output` function exports data from a VCF file to a specified output file in various
        formats, including VCF, CSV, TSV, PSV, and Parquet.

        :param output_file: The `output_file` parameter is a string that specifies the name of the
        output file to be generated by the function.
This is where the exported data will be saved 1965 :type output_file: str 1966 :param output_header: The `output_header` parameter is a string that specifies the name of the 1967 file where the header of the VCF file will be exported. If this parameter is not provided, the 1968 header will be exported to a file with the same name as the `output_file` parameter, but with 1969 the extension " 1970 :type output_header: str 1971 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1972 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1973 True, the header will be exported to a file. If `export_header` is False, the header will not 1974 be, defaults to True, if output format is not VCF 1975 :type export_header: bool (optional) 1976 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1977 select specific data from the VCF file before exporting it. If provided, only the data that 1978 matches the query will be exported 1979 :type query: str 1980 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1981 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 1982 organize data in a hierarchical directory structure based on the values of one or more columns. 1983 This can improve query performance when working with large datasets 1984 :type parquet_partitions: list 1985 :param chunk_size: The `chunk_size` parameter specifies the number of 1986 records in batch when exporting data in Parquet format. This parameter is used for 1987 partitioning the Parquet file into multiple files. 1988 :type chunk_size: int 1989 :param threads: The `threads` parameter is an optional parameter that specifies the number of 1990 threads to be used during the export process. It determines the level of parallelism and can 1991 improve the performance of the export operation. 
If not provided, the function will use the 1992 default number of threads 1993 :type threads: int 1994 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 1995 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 1996 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 1997 False 1998 :type sort: bool (optional) 1999 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2000 created on the output file. If `index` is True, an index will be created. If `index` is False, 2001 no index will be created. The default value is False, defaults to False 2002 :type index: bool (optional) 2003 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2004 sorting the output file. This parameter is only applicable when exporting data in VCF format 2005 :type order_by: str 2006 :return: a boolean value. It checks if the output file exists and returns True if it does, or 2007 None if it doesn't. 
2008 """ 2009 2010 # Log 2011 log.info("Exporting...") 2012 2013 # Full path 2014 output_file = full_path(output_file) 2015 output_header = full_path(output_header) 2016 2017 # Config 2018 config = self.get_config() 2019 2020 # Param 2021 param = self.get_param() 2022 2023 # Tmp files to remove 2024 tmp_to_remove = [] 2025 2026 # If no output, get it 2027 if not output_file: 2028 output_file = self.get_output() 2029 2030 # If not threads 2031 if not threads: 2032 threads = self.get_threads() 2033 2034 # Auto header name with extension 2035 if export_header or output_header: 2036 if not output_header: 2037 output_header = f"{output_file}.hdr" 2038 # Export header 2039 self.export_header(output_file=output_file) 2040 2041 # Switch off export header if VCF output 2042 output_file_type = get_file_format(output_file) 2043 if output_file_type in ["vcf"]: 2044 export_header = False 2045 tmp_to_remove.append(output_header) 2046 2047 # Chunk size 2048 if not chunk_size: 2049 chunk_size = config.get("chunk_size", None) 2050 2051 # Parquet partition 2052 if not parquet_partitions: 2053 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2054 if parquet_partitions and isinstance(parquet_partitions, str): 2055 parquet_partitions = parquet_partitions.split(",") 2056 2057 # Order by 2058 if not order_by: 2059 order_by = param.get("export", {}).get("order_by", "") 2060 2061 # Header in output 2062 header_in_output = param.get("export", {}).get("include_header", False) 2063 2064 # Database 2065 database_source = self.get_connexion() 2066 2067 # Connexion format 2068 connexion_format = self.get_connexion_format() 2069 2070 # Explode infos 2071 if self.get_explode_infos(): 2072 self.explode_infos( 2073 prefix=self.get_explode_infos_prefix(), 2074 fields=self.get_explode_infos_fields(), 2075 force=False, 2076 ) 2077 2078 # if connexion_format in ["sqlite"] or query: 2079 if connexion_format in ["sqlite"]: 2080 2081 # Export in Parquet 2082 random_tmp = 
"".join( 2083 random.choice(string.ascii_lowercase) for i in range(10) 2084 ) 2085 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2086 tmp_to_remove.append(database_source) 2087 2088 # Table Variants 2089 table_variants = self.get_table_variants() 2090 2091 # Create export query 2092 sql_query_export_subquery = f""" 2093 SELECT * FROM {table_variants} 2094 """ 2095 2096 # Write source file 2097 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2098 2099 # Create database 2100 database = Database( 2101 database=database_source, 2102 table="variants", 2103 header_file=output_header, 2104 conn_config=self.get_connexion_config(), 2105 ) 2106 2107 # Existing colomns header 2108 # existing_columns_header = database.get_header_file_columns(output_header) 2109 existing_columns_header = database.get_header_columns_from_database() 2110 2111 # Export file 2112 database.export( 2113 output_database=output_file, 2114 output_header=output_header, 2115 existing_columns_header=existing_columns_header, 2116 parquet_partitions=parquet_partitions, 2117 chunk_size=chunk_size, 2118 threads=threads, 2119 sort=sort, 2120 index=index, 2121 header_in_output=header_in_output, 2122 order_by=order_by, 2123 query=query, 2124 export_header=export_header, 2125 ) 2126 2127 # Remove 2128 remove_if_exists(tmp_to_remove) 2129 2130 return (os.path.exists(output_file) or None) and ( 2131 os.path.exists(output_file) or None 2132 ) 2133 2134 def get_extra_infos(self, table: str = None) -> list: 2135 """ 2136 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2137 in the header. 2138 2139 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2140 name of the table from which you want to retrieve the extra columns that are not present in the 2141 header. 
If the `table` parameter is not provided when calling the function, it will default to 2142 using the variants 2143 :type table: str 2144 :return: A list of columns that are in the specified table but not in the header of the table. 2145 """ 2146 2147 header_columns = [] 2148 2149 if not table: 2150 table = self.get_table_variants(clause="from") 2151 header_columns = self.get_header_columns() 2152 2153 # Check all columns in the database 2154 query = f""" SELECT * FROM {table} LIMIT 1 """ 2155 log.debug(f"query {query}") 2156 table_columns = self.get_query_to_df(query).columns.tolist() 2157 extra_columns = [] 2158 2159 # Construct extra infos (not in header) 2160 for column in table_columns: 2161 if column not in header_columns: 2162 extra_columns.append(column) 2163 2164 return extra_columns 2165 2166 def get_extra_infos_sql(self, table: str = None) -> str: 2167 """ 2168 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2169 by double quotes 2170 2171 :param table: The name of the table to get the extra infos from. If None, the default table is 2172 used 2173 :type table: str 2174 :return: A string of the extra infos 2175 """ 2176 2177 return ", ".join( 2178 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2179 ) 2180 2181 def export_header( 2182 self, 2183 header_name: str = None, 2184 output_file: str = None, 2185 output_file_ext: str = ".hdr", 2186 clean_header: bool = True, 2187 remove_chrom_line: bool = False, 2188 ) -> str: 2189 """ 2190 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2191 specified options, and writes it to a new file. 2192 2193 :param header_name: The `header_name` parameter is the name of the header file to be created. 
If 2194 this parameter is not specified, the header will be written to the output file 2195 :type header_name: str 2196 :param output_file: The `output_file` parameter in the `export_header` function is used to 2197 specify the name of the output file where the header will be written. If this parameter is not 2198 provided, the header will be written to a temporary file 2199 :type output_file: str 2200 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2201 string that represents the extension of the output header file. By default, it is set to ".hdr" 2202 if not specified by the user. This extension will be appended to the `output_file` name to 2203 create the final, defaults to .hdr 2204 :type output_file_ext: str (optional) 2205 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2206 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2207 `True`, the function will clean the header by modifying certain lines based on a specific 2208 pattern. If `clean_header`, defaults to True 2209 :type clean_header: bool (optional) 2210 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2211 boolean flag that determines whether the #CHROM line should be removed from the header before 2212 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2213 defaults to False 2214 :type remove_chrom_line: bool (optional) 2215 :return: The function `export_header` returns the name of the temporary header file that is 2216 created. 
2217 """ 2218 2219 if not header_name and not output_file: 2220 output_file = self.get_output() 2221 2222 if self.get_header(): 2223 2224 # Get header object 2225 header_obj = self.get_header() 2226 2227 # Create database 2228 db_for_header = Database(database=self.get_input()) 2229 2230 # Get real columns in the file 2231 db_header_columns = db_for_header.get_columns() 2232 2233 with tempfile.TemporaryDirectory() as tmpdir: 2234 2235 # Write header file 2236 header_file_tmp = os.path.join(tmpdir, "header") 2237 f = open(header_file_tmp, "w") 2238 vcf.Writer(f, header_obj) 2239 f.close() 2240 2241 # Replace #CHROM line with rel columns 2242 header_list = db_for_header.read_header_file( 2243 header_file=header_file_tmp 2244 ) 2245 header_list[-1] = "\t".join(db_header_columns) 2246 2247 # Remove CHROM line 2248 if remove_chrom_line: 2249 header_list.pop() 2250 2251 # Clean header 2252 if clean_header: 2253 header_list_clean = [] 2254 for head in header_list: 2255 # Clean head for malformed header 2256 head_clean = head 2257 head_clean = re.subn( 2258 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2259 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2260 head_clean, 2261 2, 2262 )[0] 2263 # Write header 2264 header_list_clean.append(head_clean) 2265 header_list = header_list_clean 2266 2267 tmp_header_name = output_file + output_file_ext 2268 2269 f = open(tmp_header_name, "w") 2270 for line in header_list: 2271 f.write(line) 2272 f.close() 2273 2274 return tmp_header_name 2275 2276 def export_variant_vcf( 2277 self, 2278 vcf_file, 2279 remove_info: bool = False, 2280 add_samples: bool = True, 2281 list_samples: list = [], 2282 where_clause: str = "", 2283 index: bool = False, 2284 threads: int | None = None, 2285 ) -> bool | None: 2286 """ 2287 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2288 remove INFO field, add samples, and control compression and indexing. 
2289 2290 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2291 written to. It is the output file that will contain the filtered VCF data based on the specified 2292 parameters 2293 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2294 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2295 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2296 in, defaults to False 2297 :type remove_info: bool (optional) 2298 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2299 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2300 If set to False, the samples will be removed. The default value is True, defaults to True 2301 :type add_samples: bool (optional) 2302 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2303 in the output VCF file. By default, all samples will be included. If you provide a list of 2304 samples, only those samples will be included in the output file 2305 :type list_samples: list 2306 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2307 determines whether or not to create an index for the output VCF file. If `index` is set to 2308 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2309 :type index: bool (optional) 2310 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2311 number of threads to use for exporting the VCF file. It determines how many parallel threads 2312 will be used during the export process. More threads can potentially speed up the export process 2313 by utilizing multiple cores of the processor. 
If 2314 :type threads: int | None 2315 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2316 method with various parameters including the output file, query, threads, sort flag, and index 2317 flag. The `export_output` method is responsible for exporting the VCF data based on the 2318 specified parameters and configurations provided in the `export_variant_vcf` function. 2319 """ 2320 2321 # Config 2322 config = self.get_config() 2323 2324 # Extract VCF 2325 log.debug("Export VCF...") 2326 2327 # Table variants 2328 table_variants = self.get_table_variants() 2329 2330 # Threads 2331 if not threads: 2332 threads = self.get_threads() 2333 2334 # Info fields 2335 if remove_info: 2336 if not isinstance(remove_info, str): 2337 remove_info = "." 2338 info_field = f"""'{remove_info}' as INFO""" 2339 else: 2340 info_field = "INFO" 2341 2342 # Samples fields 2343 if add_samples: 2344 if not list_samples: 2345 list_samples = self.get_header_sample_list() 2346 if list_samples: 2347 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2348 else: 2349 samples_fields = "" 2350 log.debug(f"samples_fields: {samples_fields}") 2351 else: 2352 samples_fields = "" 2353 2354 # Where clause 2355 if where_clause is None: 2356 where_clause = "" 2357 2358 # Variants 2359 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2360 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2361 log.debug(f"sql_query_select={sql_query_select}") 2362 2363 return self.export_output( 2364 output_file=vcf_file, 2365 output_header=None, 2366 export_header=True, 2367 query=sql_query_select, 2368 parquet_partitions=None, 2369 chunk_size=config.get("chunk_size", None), 2370 threads=threads, 2371 sort=True, 2372 index=index, 2373 order_by=None, 2374 ) 2375 2376 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2377 """ 2378 It takes a list of commands and runs 
them in parallel using the number of threads specified 2379 2380 :param commands: A list of commands to run 2381 :param threads: The number of threads to use, defaults to 1 (optional) 2382 """ 2383 2384 run_parallel_commands(commands, threads) 2385 2386 def get_threads(self, default: int = 1) -> int: 2387 """ 2388 This function returns the number of threads to use for a job, with a default value of 1 if not 2389 specified. 2390 2391 :param default: The `default` parameter in the `get_threads` method is used to specify the 2392 default number of threads to use if no specific value is provided. If no value is provided for 2393 the `threads` parameter in the configuration or input parameters, the `default` value will be 2394 used, defaults to 1 2395 :type default: int (optional) 2396 :return: the number of threads to use for the current job. 2397 """ 2398 2399 # Config 2400 config = self.get_config() 2401 2402 # Param 2403 param = self.get_param() 2404 2405 # Input threads 2406 input_thread = param.get("threads", config.get("threads", None)) 2407 2408 # Check threads 2409 if not input_thread: 2410 threads = default 2411 elif int(input_thread) <= 0: 2412 threads = os.cpu_count() 2413 else: 2414 threads = int(input_thread) 2415 return threads 2416 2417 def get_memory(self, default: str = None) -> str: 2418 """ 2419 This function retrieves the memory value from parameters or configuration with a default value 2420 if not found. 2421 2422 :param default: The `get_memory` function takes in a default value as a string parameter. This 2423 default value is used as a fallback in case the `memory` parameter is not provided in the 2424 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2425 the function 2426 :type default: str 2427 :return: The `get_memory` function returns a string value representing the memory parameter. If 2428 the `input_memory` is provided in the parameters, it will return that value. 
Otherwise, it will 2429 return the default value provided as an argument to the function. 2430 """ 2431 2432 # Config 2433 config = self.get_config() 2434 2435 # Param 2436 param = self.get_param() 2437 2438 # Input threads 2439 input_memory = param.get("memory", config.get("memory", None)) 2440 2441 # Check threads 2442 if input_memory: 2443 memory = input_memory 2444 else: 2445 memory = default 2446 2447 return memory 2448 2449 def update_from_vcf(self, vcf_file: str) -> None: 2450 """ 2451 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2452 2453 :param vcf_file: the path to the VCF file 2454 """ 2455 2456 connexion_format = self.get_connexion_format() 2457 2458 if connexion_format in ["duckdb"]: 2459 self.update_from_vcf_duckdb(vcf_file) 2460 elif connexion_format in ["sqlite"]: 2461 self.update_from_vcf_sqlite(vcf_file) 2462 2463 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2464 """ 2465 It takes a VCF file and updates the INFO column of the variants table in the database with the 2466 INFO column of the VCF file 2467 2468 :param vcf_file: the path to the VCF file 2469 """ 2470 2471 # varaints table 2472 table_variants = self.get_table_variants() 2473 2474 # Loading VCF into temporaire table 2475 skip = self.get_header_length(file=vcf_file) 2476 vcf_df = pd.read_csv( 2477 vcf_file, 2478 sep="\t", 2479 engine="c", 2480 skiprows=skip, 2481 header=0, 2482 low_memory=False, 2483 ) 2484 sql_query_update = f""" 2485 UPDATE {table_variants} as table_variants 2486 SET INFO = concat( 2487 CASE 2488 WHEN INFO NOT IN ('', '.') 2489 THEN INFO 2490 ELSE '' 2491 END, 2492 ( 2493 SELECT 2494 concat( 2495 CASE 2496 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2497 THEN ';' 2498 ELSE '' 2499 END 2500 , 2501 CASE 2502 WHEN table_parquet.INFO NOT IN ('','.') 2503 THEN table_parquet.INFO 2504 ELSE '' 2505 END 2506 ) 2507 FROM vcf_df as table_parquet 2508 WHERE CAST(table_parquet.\"#CHROM\" AS 
VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2509 AND table_parquet.\"POS\" = table_variants.\"POS\" 2510 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2511 AND table_parquet.\"REF\" = table_variants.\"REF\" 2512 AND table_parquet.INFO NOT IN ('','.') 2513 ) 2514 ) 2515 ; 2516 """ 2517 self.conn.execute(sql_query_update) 2518 2519 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2520 """ 2521 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2522 table, then updates the INFO column of the variants table with the INFO column of the temporary 2523 table 2524 2525 :param vcf_file: The path to the VCF file you want to update the database with 2526 """ 2527 2528 # Create a temporary table for the VCF 2529 table_vcf = "tmp_vcf" 2530 sql_create = ( 2531 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2532 ) 2533 self.conn.execute(sql_create) 2534 2535 # Loading VCF into temporaire table 2536 vcf_df = pd.read_csv( 2537 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2538 ) 2539 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2540 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2541 2542 # Update table 'variants' with VCF data 2543 # warning: CONCAT as || operator 2544 sql_query_update = f""" 2545 UPDATE variants as table_variants 2546 SET INFO = CASE 2547 WHEN INFO NOT IN ('', '.') 2548 THEN INFO 2549 ELSE '' 2550 END || 2551 ( 2552 SELECT 2553 CASE 2554 WHEN table_variants.INFO NOT IN ('','.') 2555 AND table_vcf.INFO NOT IN ('','.') 2556 THEN ';' 2557 ELSE '' 2558 END || 2559 CASE 2560 WHEN table_vcf.INFO NOT IN ('','.') 2561 THEN table_vcf.INFO 2562 ELSE '' 2563 END 2564 FROM {table_vcf} as table_vcf 2565 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2566 AND table_vcf.\"POS\" = table_variants.\"POS\" 2567 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2568 AND table_vcf.\"REF\" = table_variants.\"REF\" 2569 
) 2570 """ 2571 self.conn.execute(sql_query_update) 2572 2573 # Drop temporary table 2574 sql_drop = f"DROP TABLE {table_vcf}" 2575 self.conn.execute(sql_drop) 2576 2577 def drop_variants_table(self) -> None: 2578 """ 2579 > This function drops the variants table 2580 """ 2581 2582 table_variants = self.get_table_variants() 2583 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2584 self.conn.execute(sql_table_variants) 2585 2586 def set_variant_id( 2587 self, variant_id_column: str = "variant_id", force: bool = None 2588 ) -> str: 2589 """ 2590 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2591 `#CHROM`, `POS`, `REF`, and `ALT` columns 2592 2593 :param variant_id_column: The name of the column to be created in the variants table, defaults 2594 to variant_id 2595 :type variant_id_column: str (optional) 2596 :param force: If True, the variant_id column will be created even if it already exists 2597 :type force: bool 2598 :return: The name of the column that contains the variant_id 2599 """ 2600 2601 # Assembly 2602 assembly = self.get_param().get( 2603 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2604 ) 2605 2606 # INFO/Tag prefix 2607 prefix = self.get_explode_infos_prefix() 2608 2609 # Explode INFO/SVTYPE 2610 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2611 2612 # variants table 2613 table_variants = self.get_table_variants() 2614 2615 # variant_id column 2616 if not variant_id_column: 2617 variant_id_column = "variant_id" 2618 2619 # Creta variant_id column 2620 if "variant_id" not in self.get_extra_infos() or force: 2621 2622 # Create column 2623 self.add_column( 2624 table_name=table_variants, 2625 column_name=variant_id_column, 2626 column_type="UBIGINT", 2627 default_value="0", 2628 ) 2629 2630 # Update column 2631 self.conn.execute( 2632 f""" 2633 UPDATE {table_variants} 2634 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", 
'"{prefix}SVTYPE"') 2635 """ 2636 ) 2637 2638 # Remove added columns 2639 for added_column in added_columns: 2640 self.drop_column(column=added_column) 2641 2642 # return variant_id column name 2643 return variant_id_column 2644 2645 def get_variant_id_column( 2646 self, variant_id_column: str = "variant_id", force: bool = None 2647 ) -> str: 2648 """ 2649 This function returns the variant_id column name 2650 2651 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2652 defaults to variant_id 2653 :type variant_id_column: str (optional) 2654 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2655 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2656 if it is not already set, or if it is set 2657 :type force: bool 2658 :return: The variant_id column name. 2659 """ 2660 2661 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2662 2663 ### 2664 # Annotation 2665 ### 2666 2667 def scan_databases( 2668 self, 2669 database_formats: list = ["parquet"], 2670 database_releases: list = ["current"], 2671 ) -> dict: 2672 """ 2673 The function `scan_databases` scans for available databases based on specified formats and 2674 releases. 2675 2676 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2677 of the databases to be scanned. In this case, the accepted format is "parquet" 2678 :type database_formats: list ["parquet"] 2679 :param database_releases: The `database_releases` parameter is a list that specifies the 2680 releases of the databases to be scanned. 
In the provided function, the default value for 2681 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2682 databases that are in the "current" 2683 :type database_releases: list 2684 :return: The function `scan_databases` returns a dictionary containing information about 2685 databases that match the specified formats and releases. 2686 """ 2687 2688 # Config 2689 config = self.get_config() 2690 2691 # Param 2692 param = self.get_param() 2693 2694 # Param - Assembly 2695 assembly = param.get("assembly", config.get("assembly", None)) 2696 if not assembly: 2697 assembly = DEFAULT_ASSEMBLY 2698 log.warning(f"Default assembly '{assembly}'") 2699 2700 # Scan for availabled databases 2701 log.info( 2702 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2703 ) 2704 databases_infos_dict = databases_infos( 2705 database_folder_releases=database_releases, 2706 database_formats=database_formats, 2707 assembly=assembly, 2708 config=config, 2709 ) 2710 log.info( 2711 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2712 ) 2713 2714 return databases_infos_dict 2715 2716 def annotation(self) -> None: 2717 """ 2718 It annotates the VCF file with the annotations specified in the config file. 
2719 """ 2720 2721 # Config 2722 config = self.get_config() 2723 2724 # Param 2725 param = self.get_param() 2726 2727 # Param - Assembly 2728 assembly = param.get("assembly", config.get("assembly", None)) 2729 if not assembly: 2730 assembly = DEFAULT_ASSEMBLY 2731 log.warning(f"Default assembly '{assembly}'") 2732 2733 # annotations databases folders 2734 annotations_databases = set( 2735 config.get("folders", {}) 2736 .get("databases", {}) 2737 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2738 + config.get("folders", {}) 2739 .get("databases", {}) 2740 .get("parquet", ["~/howard/databases/parquet/current"]) 2741 + config.get("folders", {}) 2742 .get("databases", {}) 2743 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2744 ) 2745 2746 # Get param annotations 2747 if param.get("annotations", None) and isinstance( 2748 param.get("annotations", None), str 2749 ): 2750 log.debug(param.get("annotations", None)) 2751 param_annotation_list = param.get("annotations").split(",") 2752 else: 2753 param_annotation_list = [] 2754 2755 # Each tools param 2756 if param.get("annotation_parquet", None) != None: 2757 log.debug( 2758 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2759 ) 2760 if isinstance(param.get("annotation_parquet", None), list): 2761 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2762 else: 2763 param_annotation_list.append(param.get("annotation_parquet")) 2764 if param.get("annotation_snpsift", None) != None: 2765 if isinstance(param.get("annotation_snpsift", None), list): 2766 param_annotation_list.append( 2767 "snpsift:" 2768 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2769 ) 2770 else: 2771 param_annotation_list.append( 2772 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2773 ) 2774 if param.get("annotation_snpeff", None) != None: 2775 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2776 if param.get("annotation_bcftools", 
None) != None: 2777 if isinstance(param.get("annotation_bcftools", None), list): 2778 param_annotation_list.append( 2779 "bcftools:" 2780 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2781 ) 2782 else: 2783 param_annotation_list.append( 2784 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2785 ) 2786 if param.get("annotation_annovar", None) != None: 2787 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2788 if param.get("annotation_exomiser", None) != None: 2789 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2790 if param.get("annotation_splice", None) != None: 2791 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2792 2793 # Merge param annotations list 2794 param["annotations"] = ",".join(param_annotation_list) 2795 2796 # debug 2797 log.debug(f"param_annotations={param['annotations']}") 2798 2799 if param.get("annotations"): 2800 2801 # Log 2802 # log.info("Annotations - Check annotation parameters") 2803 2804 if not "annotation" in param: 2805 param["annotation"] = {} 2806 2807 # List of annotations parameters 2808 annotations_list_input = {} 2809 if isinstance(param.get("annotations", None), str): 2810 annotation_file_list = [ 2811 value for value in param.get("annotations", "").split(",") 2812 ] 2813 for annotation_file in annotation_file_list: 2814 annotations_list_input[annotation_file] = {"INFO": None} 2815 else: 2816 annotations_list_input = param.get("annotations", {}) 2817 2818 log.info(f"Quick Annotations:") 2819 for annotation_key in list(annotations_list_input.keys()): 2820 log.info(f" {annotation_key}") 2821 2822 # List of annotations and associated fields 2823 annotations_list = {} 2824 2825 for annotation_file in annotations_list_input: 2826 2827 # Explode annotations if ALL 2828 if ( 2829 annotation_file.upper() == "ALL" 2830 or annotation_file.upper().startswith("ALL:") 2831 ): 2832 2833 # check ALL parameters (formats, releases) 
2834 annotation_file_split = annotation_file.split(":") 2835 database_formats = "parquet" 2836 database_releases = "current" 2837 for annotation_file_option in annotation_file_split[1:]: 2838 database_all_options_split = annotation_file_option.split("=") 2839 if database_all_options_split[0] == "format": 2840 database_formats = database_all_options_split[1].split("+") 2841 if database_all_options_split[0] == "release": 2842 database_releases = database_all_options_split[1].split("+") 2843 2844 # Scan for availabled databases 2845 databases_infos_dict = self.scan_databases( 2846 database_formats=database_formats, 2847 database_releases=database_releases, 2848 ) 2849 2850 # Add found databases in annotation parameters 2851 for database_infos in databases_infos_dict.keys(): 2852 annotations_list[database_infos] = {"INFO": None} 2853 2854 else: 2855 annotations_list[annotation_file] = annotations_list_input[ 2856 annotation_file 2857 ] 2858 2859 # Check each databases 2860 if len(annotations_list): 2861 2862 log.info( 2863 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2864 ) 2865 2866 for annotation_file in annotations_list: 2867 2868 # Init 2869 annotations = annotations_list.get(annotation_file, None) 2870 2871 # Annotation snpEff 2872 if annotation_file.startswith("snpeff"): 2873 2874 log.debug(f"Quick Annotation snpEff") 2875 2876 if "snpeff" not in param["annotation"]: 2877 param["annotation"]["snpeff"] = {} 2878 2879 if "options" not in param["annotation"]["snpeff"]: 2880 param["annotation"]["snpeff"]["options"] = "" 2881 2882 # snpEff options in annotations 2883 param["annotation"]["snpeff"]["options"] = "".join( 2884 annotation_file.split(":")[1:] 2885 ) 2886 2887 # Annotation Annovar 2888 elif annotation_file.startswith("annovar"): 2889 2890 log.debug(f"Quick Annotation Annovar") 2891 2892 if "annovar" not in param["annotation"]: 2893 param["annotation"]["annovar"] = {} 2894 2895 if "annotations" not in param["annotation"]["annovar"]: 2896 param["annotation"]["annovar"]["annotations"] = {} 2897 2898 # Options 2899 annotation_file_split = annotation_file.split(":") 2900 for annotation_file_annotation in annotation_file_split[1:]: 2901 if annotation_file_annotation: 2902 param["annotation"]["annovar"]["annotations"][ 2903 annotation_file_annotation 2904 ] = annotations 2905 2906 # Annotation Exomiser 2907 elif annotation_file.startswith("exomiser"): 2908 2909 log.debug(f"Quick Annotation Exomiser") 2910 2911 param["annotation"]["exomiser"] = params_string_to_dict( 2912 annotation_file 2913 ) 2914 2915 # Annotation Splice 2916 elif annotation_file.startswith("splice"): 2917 2918 log.debug(f"Quick Annotation Splice") 2919 2920 param["annotation"]["splice"] = params_string_to_dict( 2921 annotation_file 2922 ) 2923 2924 # Annotation Parquet or BCFTOOLS 2925 else: 2926 2927 # Tools detection 2928 if annotation_file.startswith("bcftools:"): 2929 annotation_tool_initial = "bcftools" 2930 annotation_file = ":".join(annotation_file.split(":")[1:]) 2931 elif annotation_file.startswith("snpsift:"): 2932 annotation_tool_initial = 
"snpsift" 2933 annotation_file = ":".join(annotation_file.split(":")[1:]) 2934 else: 2935 annotation_tool_initial = None 2936 2937 # list of files 2938 annotation_file_list = annotation_file.replace("+", ":").split( 2939 ":" 2940 ) 2941 2942 for annotation_file in annotation_file_list: 2943 2944 if annotation_file: 2945 2946 # Annotation tool initial 2947 annotation_tool = annotation_tool_initial 2948 2949 # Find file 2950 annotation_file_found = None 2951 2952 # Expand user 2953 annotation_file = full_path(annotation_file) 2954 2955 if os.path.exists(annotation_file): 2956 annotation_file_found = annotation_file 2957 2958 else: 2959 # Find within assembly folders 2960 for annotations_database in annotations_databases: 2961 found_files = find_all( 2962 annotation_file, 2963 os.path.join( 2964 annotations_database, assembly 2965 ), 2966 ) 2967 if len(found_files) > 0: 2968 annotation_file_found = found_files[0] 2969 break 2970 if not annotation_file_found and not assembly: 2971 # Find within folders 2972 for ( 2973 annotations_database 2974 ) in annotations_databases: 2975 found_files = find_all( 2976 annotation_file, annotations_database 2977 ) 2978 if len(found_files) > 0: 2979 annotation_file_found = found_files[0] 2980 break 2981 log.debug( 2982 f"for {annotation_file} annotation_file_found={annotation_file_found}" 2983 ) 2984 2985 # Full path 2986 annotation_file_found = full_path(annotation_file_found) 2987 2988 if annotation_file_found: 2989 2990 database = Database(database=annotation_file_found) 2991 quick_annotation_format = database.get_format() 2992 quick_annotation_is_compressed = ( 2993 database.is_compressed() 2994 ) 2995 quick_annotation_is_indexed = os.path.exists( 2996 f"{annotation_file_found}.tbi" 2997 ) 2998 bcftools_preference = False 2999 3000 # Check Annotation Tool 3001 if not annotation_tool: 3002 if ( 3003 bcftools_preference 3004 and quick_annotation_format 3005 in ["vcf", "bed"] 3006 and quick_annotation_is_compressed 3007 and 
quick_annotation_is_indexed 3008 ): 3009 annotation_tool = "bcftools" 3010 elif quick_annotation_format in [ 3011 "vcf", 3012 "bed", 3013 "tsv", 3014 "tsv", 3015 "csv", 3016 "json", 3017 "tbl", 3018 "parquet", 3019 "duckdb", 3020 ]: 3021 annotation_tool = "parquet" 3022 else: 3023 log.error( 3024 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3025 ) 3026 raise ValueError( 3027 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3028 ) 3029 3030 log.debug( 3031 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3032 ) 3033 3034 # Annotation Tool dispatch 3035 if annotation_tool: 3036 if annotation_tool not in param["annotation"]: 3037 param["annotation"][annotation_tool] = {} 3038 if ( 3039 "annotations" 3040 not in param["annotation"][annotation_tool] 3041 ): 3042 param["annotation"][annotation_tool][ 3043 "annotations" 3044 ] = {} 3045 param["annotation"][annotation_tool][ 3046 "annotations" 3047 ][annotation_file_found] = annotations 3048 3049 else: 3050 log.error( 3051 f"Quick Annotation File {annotation_file} does NOT exist" 3052 ) 3053 3054 self.set_param(param) 3055 3056 if param.get("annotation", None): 3057 log.info("Annotations") 3058 if param.get("annotation", {}).get("parquet", None): 3059 log.info("Annotations 'parquet'...") 3060 self.annotation_parquet() 3061 if param.get("annotation", {}).get("bcftools", None): 3062 log.info("Annotations 'bcftools'...") 3063 self.annotation_bcftools() 3064 if param.get("annotation", {}).get("snpsift", None): 3065 log.info("Annotations 'snpsift'...") 3066 self.annotation_snpsift() 3067 if param.get("annotation", {}).get("annovar", None): 3068 log.info("Annotations 'annovar'...") 3069 self.annotation_annovar() 3070 if param.get("annotation", {}).get("snpeff", None): 3071 log.info("Annotations 'snpeff'...") 3072 self.annotation_snpeff() 3073 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3074 log.info("Annotations 'exomiser'...") 3075 self.annotation_exomiser() 3076 if param.get("annotation", {}).get("splice", None) is not None: 3077 log.info("Annotations 'splice' ...") 3078 self.annotation_splice() 3079 3080 # Explode INFOS fields into table fields 3081 if self.get_explode_infos(): 3082 self.explode_infos( 3083 prefix=self.get_explode_infos_prefix(), 3084 fields=self.get_explode_infos_fields(), 3085 force=True, 3086 ) 3087 3088 def annotation_snpsift(self, threads: int = None) -> None: 3089 """ 3090 This function annotate with bcftools 3091 3092 :param threads: Number of threads to use 3093 :return: the value of the variable "return_value". 3094 """ 3095 3096 # DEBUG 3097 log.debug("Start annotation with bcftools databases") 3098 3099 # Threads 3100 if not threads: 3101 threads = self.get_threads() 3102 log.debug("Threads: " + str(threads)) 3103 3104 # Config 3105 config = self.get_config() 3106 log.debug("Config: " + str(config)) 3107 3108 # Config - snpSift 3109 snpsift_bin_command = get_bin_command( 3110 bin="SnpSift.jar", 3111 tool="snpsift", 3112 bin_type="jar", 3113 config=config, 3114 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3115 ) 3116 if not snpsift_bin_command: 3117 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3118 log.error(msg_err) 3119 raise ValueError(msg_err) 3120 3121 # Config - bcftools 3122 bcftools_bin_command = get_bin_command( 3123 bin="bcftools", 3124 tool="bcftools", 3125 bin_type="bin", 3126 config=config, 3127 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3128 ) 3129 if not bcftools_bin_command: 3130 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3131 log.error(msg_err) 3132 raise ValueError(msg_err) 3133 3134 # Config - BCFTools databases folders 3135 databases_folders = set( 3136 self.get_config() 3137 .get("folders", {}) 3138 .get("databases", {}) 3139 .get("annotations", ["."]) 3140 + self.get_config() 3141 .get("folders", {}) 3142 
.get("databases", {}) 3143 .get("bcftools", ["."]) 3144 ) 3145 log.debug("Databases annotations: " + str(databases_folders)) 3146 3147 # Param 3148 annotations = ( 3149 self.get_param() 3150 .get("annotation", {}) 3151 .get("snpsift", {}) 3152 .get("annotations", None) 3153 ) 3154 log.debug("Annotations: " + str(annotations)) 3155 3156 # Assembly 3157 assembly = self.get_param().get( 3158 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3159 ) 3160 3161 # Data 3162 table_variants = self.get_table_variants() 3163 3164 # Check if not empty 3165 log.debug("Check if not empty") 3166 sql_query_chromosomes = ( 3167 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3168 ) 3169 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3170 if not sql_query_chromosomes_df["count"][0]: 3171 log.info(f"VCF empty") 3172 return 3173 3174 # VCF header 3175 vcf_reader = self.get_header() 3176 log.debug("Initial header: " + str(vcf_reader.infos)) 3177 3178 # Existing annotations 3179 for vcf_annotation in self.get_header().infos: 3180 3181 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3182 log.debug( 3183 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3184 ) 3185 3186 if annotations: 3187 3188 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3189 3190 # Export VCF file 3191 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3192 3193 # Init 3194 commands = {} 3195 3196 for annotation in annotations: 3197 annotation_fields = annotations[annotation] 3198 3199 # Annotation Name 3200 annotation_name = os.path.basename(annotation) 3201 3202 if not annotation_fields: 3203 annotation_fields = {"INFO": None} 3204 3205 log.debug(f"Annotation '{annotation_name}'") 3206 log.debug( 3207 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3208 ) 3209 3210 # Create Database 3211 database = Database( 3212 database=annotation, 3213 databases_folders=databases_folders, 3214 
assembly=assembly, 3215 ) 3216 3217 # Find files 3218 db_file = database.get_database() 3219 db_file = full_path(db_file) 3220 db_hdr_file = database.get_header_file() 3221 db_hdr_file = full_path(db_hdr_file) 3222 db_file_type = database.get_format() 3223 db_tbi_file = f"{db_file}.tbi" 3224 db_file_compressed = database.is_compressed() 3225 3226 # Check if compressed 3227 if not db_file_compressed: 3228 log.error( 3229 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3230 ) 3231 raise ValueError( 3232 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3233 ) 3234 3235 # Check if indexed 3236 if not os.path.exists(db_tbi_file): 3237 log.error( 3238 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3239 ) 3240 raise ValueError( 3241 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3242 ) 3243 3244 # Check index - try to create if not exists 3245 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3246 log.error("Annotation failed: database not valid") 3247 log.error(f"Annotation annotation file: {db_file}") 3248 log.error(f"Annotation annotation header: {db_hdr_file}") 3249 log.error(f"Annotation annotation index: {db_tbi_file}") 3250 raise ValueError( 3251 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3252 ) 3253 else: 3254 3255 log.debug( 3256 f"Annotation '{annotation}' - file: " 3257 + str(db_file) 3258 + " and " 3259 + str(db_hdr_file) 3260 ) 3261 3262 # Load header as VCF object 3263 db_hdr_vcf = Variants(input=db_hdr_file) 3264 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3265 log.debug( 3266 "Annotation database header: " 3267 + str(db_hdr_vcf_header_infos) 3268 ) 3269 3270 # For all fields in database 3271 annotation_fields_full = False 3272 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3273 annotation_fields = { 3274 key: key for key in 
db_hdr_vcf_header_infos 3275 } 3276 log.debug( 3277 "Annotation database header - All annotations added: " 3278 + str(annotation_fields) 3279 ) 3280 annotation_fields_full = True 3281 3282 # # Create file for field rename 3283 # log.debug("Create file for field rename") 3284 # tmp_rename = NamedTemporaryFile( 3285 # prefix=self.get_prefix(), 3286 # dir=self.get_tmp_dir(), 3287 # suffix=".rename", 3288 # delete=False, 3289 # ) 3290 # tmp_rename_name = tmp_rename.name 3291 # tmp_files.append(tmp_rename_name) 3292 3293 # Number of fields 3294 nb_annotation_field = 0 3295 annotation_list = [] 3296 annotation_infos_rename_list = [] 3297 3298 for annotation_field in annotation_fields: 3299 3300 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3301 annotation_fields_new_name = annotation_fields.get( 3302 annotation_field, annotation_field 3303 ) 3304 if not annotation_fields_new_name: 3305 annotation_fields_new_name = annotation_field 3306 3307 # Check if field is in DB and if field is not elready in input data 3308 if ( 3309 annotation_field in db_hdr_vcf.get_header().infos 3310 and annotation_fields_new_name 3311 not in self.get_header().infos 3312 ): 3313 3314 log.info( 3315 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3316 ) 3317 3318 # BCFTools annotate param to rename fields 3319 if annotation_field != annotation_fields_new_name: 3320 annotation_infos_rename_list.append( 3321 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3322 ) 3323 3324 # Add INFO field to header 3325 db_hdr_vcf_header_infos_number = ( 3326 db_hdr_vcf_header_infos[annotation_field].num or "." 
3327 ) 3328 db_hdr_vcf_header_infos_type = ( 3329 db_hdr_vcf_header_infos[annotation_field].type 3330 or "String" 3331 ) 3332 db_hdr_vcf_header_infos_description = ( 3333 db_hdr_vcf_header_infos[annotation_field].desc 3334 or f"{annotation_field} description" 3335 ) 3336 db_hdr_vcf_header_infos_source = ( 3337 db_hdr_vcf_header_infos[annotation_field].source 3338 or "unknown" 3339 ) 3340 db_hdr_vcf_header_infos_version = ( 3341 db_hdr_vcf_header_infos[annotation_field].version 3342 or "unknown" 3343 ) 3344 3345 vcf_reader.infos[annotation_fields_new_name] = ( 3346 vcf.parser._Info( 3347 annotation_fields_new_name, 3348 db_hdr_vcf_header_infos_number, 3349 db_hdr_vcf_header_infos_type, 3350 db_hdr_vcf_header_infos_description, 3351 db_hdr_vcf_header_infos_source, 3352 db_hdr_vcf_header_infos_version, 3353 self.code_type_map[ 3354 db_hdr_vcf_header_infos_type 3355 ], 3356 ) 3357 ) 3358 3359 annotation_list.append(annotation_field) 3360 3361 nb_annotation_field += 1 3362 3363 else: 3364 3365 if ( 3366 annotation_field 3367 not in db_hdr_vcf.get_header().infos 3368 ): 3369 log.warning( 3370 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3371 ) 3372 if ( 3373 annotation_fields_new_name 3374 in self.get_header().infos 3375 ): 3376 log.warning( 3377 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3378 ) 3379 3380 log.info( 3381 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3382 ) 3383 3384 annotation_infos = ",".join(annotation_list) 3385 3386 if annotation_infos != "": 3387 3388 # Annotated VCF (and error file) 3389 tmp_annotation_vcf_name = os.path.join( 3390 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3391 ) 3392 tmp_annotation_vcf_name_err = ( 3393 tmp_annotation_vcf_name + ".err" 3394 ) 3395 3396 # Add fields to annotate 3397 if not annotation_fields_full: 3398 annotation_infos_option = f"-info {annotation_infos}" 3399 else: 
3400 annotation_infos_option = "" 3401 3402 # Info fields rename 3403 if annotation_infos_rename_list: 3404 annotation_infos_rename = " -c " + ",".join( 3405 annotation_infos_rename_list 3406 ) 3407 else: 3408 annotation_infos_rename = "" 3409 3410 # Annotate command 3411 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3412 3413 # Add command 3414 commands[command_annotate] = tmp_annotation_vcf_name 3415 3416 if commands: 3417 3418 # Export VCF file 3419 self.export_variant_vcf( 3420 vcf_file=tmp_vcf_name, 3421 remove_info=True, 3422 add_samples=False, 3423 index=True, 3424 ) 3425 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3426 3427 # Num command 3428 nb_command = 0 3429 3430 # Annotate 3431 for command_annotate in commands: 3432 nb_command += 1 3433 log.info( 3434 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3435 ) 3436 log.debug(f"command_annotate={command_annotate}") 3437 run_parallel_commands([command_annotate], threads) 3438 3439 # Debug 3440 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3441 3442 # Update variants 3443 log.info( 3444 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3445 ) 3446 self.update_from_vcf(commands[command_annotate]) 3447 3448 def annotation_bcftools(self, threads: int = None) -> None: 3449 """ 3450 This function annotate with bcftools 3451 3452 :param threads: Number of threads to use 3453 :return: the value of the variable "return_value". 
3454 """ 3455 3456 # DEBUG 3457 log.debug("Start annotation with bcftools databases") 3458 3459 # Threads 3460 if not threads: 3461 threads = self.get_threads() 3462 log.debug("Threads: " + str(threads)) 3463 3464 # Config 3465 config = self.get_config() 3466 log.debug("Config: " + str(config)) 3467 3468 # DEBUG 3469 delete_tmp = True 3470 if self.get_config().get("verbosity", "warning") in ["debug"]: 3471 delete_tmp = False 3472 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3473 3474 # Config - BCFTools bin command 3475 bcftools_bin_command = get_bin_command( 3476 bin="bcftools", 3477 tool="bcftools", 3478 bin_type="bin", 3479 config=config, 3480 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3481 ) 3482 if not bcftools_bin_command: 3483 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3484 log.error(msg_err) 3485 raise ValueError(msg_err) 3486 3487 # Config - BCFTools databases folders 3488 databases_folders = set( 3489 self.get_config() 3490 .get("folders", {}) 3491 .get("databases", {}) 3492 .get("annotations", ["."]) 3493 + self.get_config() 3494 .get("folders", {}) 3495 .get("databases", {}) 3496 .get("bcftools", ["."]) 3497 ) 3498 log.debug("Databases annotations: " + str(databases_folders)) 3499 3500 # Param 3501 annotations = ( 3502 self.get_param() 3503 .get("annotation", {}) 3504 .get("bcftools", {}) 3505 .get("annotations", None) 3506 ) 3507 log.debug("Annotations: " + str(annotations)) 3508 3509 # Assembly 3510 assembly = self.get_param().get( 3511 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3512 ) 3513 3514 # Data 3515 table_variants = self.get_table_variants() 3516 3517 # Check if not empty 3518 log.debug("Check if not empty") 3519 sql_query_chromosomes = ( 3520 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3521 ) 3522 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3523 if not sql_query_chromosomes_df["count"][0]: 3524 log.info(f"VCF empty") 
3525 return 3526 3527 # Export in VCF 3528 log.debug("Create initial file to annotate") 3529 tmp_vcf = NamedTemporaryFile( 3530 prefix=self.get_prefix(), 3531 dir=self.get_tmp_dir(), 3532 suffix=".vcf.gz", 3533 delete=False, 3534 ) 3535 tmp_vcf_name = tmp_vcf.name 3536 3537 # VCF header 3538 vcf_reader = self.get_header() 3539 log.debug("Initial header: " + str(vcf_reader.infos)) 3540 3541 # Existing annotations 3542 for vcf_annotation in self.get_header().infos: 3543 3544 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3545 log.debug( 3546 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3547 ) 3548 3549 if annotations: 3550 3551 tmp_ann_vcf_list = [] 3552 commands = [] 3553 tmp_files = [] 3554 err_files = [] 3555 3556 for annotation in annotations: 3557 annotation_fields = annotations[annotation] 3558 3559 # Annotation Name 3560 annotation_name = os.path.basename(annotation) 3561 3562 if not annotation_fields: 3563 annotation_fields = {"INFO": None} 3564 3565 log.debug(f"Annotation '{annotation_name}'") 3566 log.debug( 3567 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3568 ) 3569 3570 # Create Database 3571 database = Database( 3572 database=annotation, 3573 databases_folders=databases_folders, 3574 assembly=assembly, 3575 ) 3576 3577 # Find files 3578 db_file = database.get_database() 3579 db_file = full_path(db_file) 3580 db_hdr_file = database.get_header_file() 3581 db_hdr_file = full_path(db_hdr_file) 3582 db_file_type = database.get_format() 3583 db_tbi_file = f"{db_file}.tbi" 3584 db_file_compressed = database.is_compressed() 3585 3586 # Check if compressed 3587 if not db_file_compressed: 3588 log.error( 3589 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3590 ) 3591 raise ValueError( 3592 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3593 ) 3594 3595 # Check if indexed 3596 if not os.path.exists(db_tbi_file): 3597 log.error(f"Annotation '{annotation}' - {db_file} NOT 
indexed file") 3598 raise ValueError( 3599 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3600 ) 3601 3602 # Check index - try to create if not exists 3603 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3604 log.error("Annotation failed: database not valid") 3605 log.error(f"Annotation annotation file: {db_file}") 3606 log.error(f"Annotation annotation header: {db_hdr_file}") 3607 log.error(f"Annotation annotation index: {db_tbi_file}") 3608 raise ValueError( 3609 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3610 ) 3611 else: 3612 3613 log.debug( 3614 f"Annotation '{annotation}' - file: " 3615 + str(db_file) 3616 + " and " 3617 + str(db_hdr_file) 3618 ) 3619 3620 # Load header as VCF object 3621 db_hdr_vcf = Variants(input=db_hdr_file) 3622 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3623 log.debug( 3624 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3625 ) 3626 3627 # For all fields in database 3628 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3629 annotation_fields = { 3630 key: key for key in db_hdr_vcf_header_infos 3631 } 3632 log.debug( 3633 "Annotation database header - All annotations added: " 3634 + str(annotation_fields) 3635 ) 3636 3637 # Number of fields 3638 nb_annotation_field = 0 3639 annotation_list = [] 3640 3641 for annotation_field in annotation_fields: 3642 3643 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3644 annotation_fields_new_name = annotation_fields.get( 3645 annotation_field, annotation_field 3646 ) 3647 if not annotation_fields_new_name: 3648 annotation_fields_new_name = annotation_field 3649 3650 # Check if field is in DB and if field is not elready in input data 3651 if ( 3652 annotation_field in db_hdr_vcf.get_header().infos 3653 and annotation_fields_new_name 3654 not in self.get_header().infos 3655 ): 3656 3657 log.info( 3658 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3659 ) 3660 3661 # Add INFO field to header 3662 db_hdr_vcf_header_infos_number = ( 3663 db_hdr_vcf_header_infos[annotation_field].num or "." 3664 ) 3665 db_hdr_vcf_header_infos_type = ( 3666 db_hdr_vcf_header_infos[annotation_field].type 3667 or "String" 3668 ) 3669 db_hdr_vcf_header_infos_description = ( 3670 db_hdr_vcf_header_infos[annotation_field].desc 3671 or f"{annotation_field} description" 3672 ) 3673 db_hdr_vcf_header_infos_source = ( 3674 db_hdr_vcf_header_infos[annotation_field].source 3675 or "unknown" 3676 ) 3677 db_hdr_vcf_header_infos_version = ( 3678 db_hdr_vcf_header_infos[annotation_field].version 3679 or "unknown" 3680 ) 3681 3682 vcf_reader.infos[annotation_fields_new_name] = ( 3683 vcf.parser._Info( 3684 annotation_fields_new_name, 3685 db_hdr_vcf_header_infos_number, 3686 db_hdr_vcf_header_infos_type, 3687 db_hdr_vcf_header_infos_description, 3688 db_hdr_vcf_header_infos_source, 3689 db_hdr_vcf_header_infos_version, 3690 self.code_type_map[db_hdr_vcf_header_infos_type], 3691 ) 3692 ) 3693 3694 # annotation_list.append(annotation_field) 3695 if annotation_field != annotation_fields_new_name: 3696 annotation_list.append( 3697 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3698 ) 3699 else: 3700 annotation_list.append(annotation_field) 3701 3702 nb_annotation_field += 1 3703 3704 else: 3705 3706 if annotation_field not in db_hdr_vcf.get_header().infos: 3707 log.warning( 3708 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3709 ) 3710 if annotation_fields_new_name in self.get_header().infos: 3711 log.warning( 3712 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3713 ) 3714 3715 log.info( 3716 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3717 ) 3718 3719 annotation_infos = ",".join(annotation_list) 3720 3721 if annotation_infos != "": 3722 3723 # Protect header for bcftools (remove "#CHROM" and variants line) 3724 log.debug("Protect Header file - remove #CHROM line if exists") 3725 tmp_header_vcf = NamedTemporaryFile( 3726 prefix=self.get_prefix(), 3727 dir=self.get_tmp_dir(), 3728 suffix=".hdr", 3729 delete=False, 3730 ) 3731 tmp_header_vcf_name = tmp_header_vcf.name 3732 tmp_files.append(tmp_header_vcf_name) 3733 # Command 3734 if db_hdr_file.endswith(".gz"): 3735 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3736 else: 3737 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3738 # Run 3739 run_parallel_commands([command_extract_header], 1) 3740 3741 # Find chomosomes 3742 log.debug("Find chromosomes ") 3743 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3744 sql_query_chromosomes_df = self.get_query_to_df( 3745 sql_query_chromosomes 3746 ) 3747 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3748 3749 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3750 3751 # BED columns in the annotation file 3752 if db_file_type in ["bed"]: 3753 annotation_infos = "CHROM,POS,POS," + annotation_infos 3754 3755 for chrom in chomosomes_list: 3756 3757 # Create BED on initial VCF 3758 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3759 tmp_bed = NamedTemporaryFile( 3760 prefix=self.get_prefix(), 3761 
dir=self.get_tmp_dir(), 3762 suffix=".bed", 3763 delete=False, 3764 ) 3765 tmp_bed_name = tmp_bed.name 3766 tmp_files.append(tmp_bed_name) 3767 3768 # Detecte regions 3769 log.debug( 3770 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3771 ) 3772 window = 1000000 3773 sql_query_intervals_for_bed = f""" 3774 SELECT \"#CHROM\", 3775 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3776 \"POS\"+{window} 3777 FROM {table_variants} as table_variants 3778 WHERE table_variants.\"#CHROM\" = '{chrom}' 3779 """ 3780 regions = self.conn.execute( 3781 sql_query_intervals_for_bed 3782 ).fetchall() 3783 merged_regions = merge_regions(regions) 3784 log.debug( 3785 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3786 ) 3787 3788 header = ["#CHROM", "START", "END"] 3789 with open(tmp_bed_name, "w") as f: 3790 # Write the header with tab delimiter 3791 f.write("\t".join(header) + "\n") 3792 for d in merged_regions: 3793 # Write each data row with tab delimiter 3794 f.write("\t".join(map(str, d)) + "\n") 3795 3796 # Tmp files 3797 tmp_annotation_vcf = NamedTemporaryFile( 3798 prefix=self.get_prefix(), 3799 dir=self.get_tmp_dir(), 3800 suffix=".vcf.gz", 3801 delete=False, 3802 ) 3803 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3804 tmp_files.append(tmp_annotation_vcf_name) 3805 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3806 tmp_annotation_vcf_name_err = ( 3807 tmp_annotation_vcf_name + ".err" 3808 ) 3809 err_files.append(tmp_annotation_vcf_name_err) 3810 3811 # Annotate Command 3812 log.debug( 3813 f"Annotation '{annotation}' - add bcftools command" 3814 ) 3815 3816 # Command 3817 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3818 3819 # Add command 3820 commands.append(command_annotate) 3821 3822 # if some commands 3823 if commands: 3824 3825 # Export VCF file 3826 self.export_variant_vcf( 3827 vcf_file=tmp_vcf_name, 3828 remove_info=True, 3829 add_samples=False, 3830 index=True, 3831 ) 3832 3833 # Threads 3834 # calculate threads for annotated commands 3835 if commands: 3836 threads_bcftools_annotate = round(threads / len(commands)) 3837 else: 3838 threads_bcftools_annotate = 1 3839 3840 if not threads_bcftools_annotate: 3841 threads_bcftools_annotate = 1 3842 3843 # Add threads option to bcftools commands 3844 if threads_bcftools_annotate > 1: 3845 commands_threaded = [] 3846 for command in commands: 3847 commands_threaded.append( 3848 command.replace( 3849 f"{bcftools_bin_command} annotate ", 3850 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3851 ) 3852 ) 3853 commands = commands_threaded 3854 3855 # Command annotation multithreading 3856 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3857 log.info( 3858 f"Annotation - Annotation multithreaded in " 3859 + str(len(commands)) 3860 + " commands" 3861 ) 3862 3863 run_parallel_commands(commands, threads) 3864 3865 # Merge 3866 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 3867 3868 if tmp_ann_vcf_list_cmd: 3869 3870 # Tmp file 3871 tmp_annotate_vcf = NamedTemporaryFile( 3872 prefix=self.get_prefix(), 3873 dir=self.get_tmp_dir(), 3874 suffix=".vcf.gz", 3875 delete=True, 3876 ) 3877 tmp_annotate_vcf_name = tmp_annotate_vcf.name 3878 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 3879 err_files.append(tmp_annotate_vcf_name_err) 3880 3881 # Tmp file remove command 3882 tmp_files_remove_command = "" 3883 if tmp_files: 3884 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 3885 3886 # Command merge 3887 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        Annotate loaded variants with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionnary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotipic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list)
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomise database release.
            If not exists, database release will be downloaded (take a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off

        :param threads: The number of threads to use
        :return: True on success, False when the VCF is empty or has no samples.
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads (default from object configuration)
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # Missing folder is logged as error but does NOT raise here;
        # databases_download_exomiser below may create/populate it.
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser bin command (jar resolved through config/tools folder)
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples (Exomiser requires at least one sample/genotype column)
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): computed and logged but not appended to the command
        # built below — confirm whether it should be part of
        # exomiser_command_analysis.
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (no-op if already present)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation (always True: re-annotation even if "Exomiser" INFO exists)
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load anlaysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json;
                        # yaml.safe_load parses both since JSON is a YAML subset)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name "preset-<preset>-analysis.json"
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load prest file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            # param_exomiser_analysis_dict[""] = json.load(json_file)
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load anlaysis json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not exists -> found sample ID
                    if not param_exomiser_subject:

                        # Found sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures not exists -> Try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Found HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list (keep digits only, e.g. "HP:0001159" -> "0001159")
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
                    # NOTE(review): removes from the list while iterating it — safe
                    # only if at most one matching step exists; verify.
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles (points at the tmp VCF exported below)
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                # NOTE(review): the check is on the top-level dict but the write
                # goes under "phenopacket" — confirm the check should not be
                # `"metaData" not in param_exomiser_analysis_dict["phenopacket"]`.
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> check
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and output format (if exists in param);
                    # TSV_VARIANT and VCF are forced so the RESULTS section below
                    # always has files to read.
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Splitted analysis dict (shallow copy: top-level keys only)
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITAL VCF file ###
                #######################

                ### Create list of samples to use and include inti initial VCF file ####

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample ID within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample ID in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and samples ID in pedigreesamples
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): never used afterwards — candidate for removal.
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contain proband param -> use split analysis + sample files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually uniq sample) -> use full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command (non-zero exit status -> failure)
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Init result tsv file
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types
                    # NOTE(review): LIMIT 0 yields an empty frame, so the
                    # numeric-content test below runs on zero rows — confirm
                    # the intended sampling behavior.
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid (coordinate/genotype columns already in the VCF)
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enable
                        if header_column not in fields_to_avoid:

                            # Header info type: object dtype that parses fully as
                            # numeric -> Float, other object -> String (default),
                            # non-object dtype -> Integer
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: sanitize "-" and "#" out of the INFO field name
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to add for update to concat fields
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append each non-empty TSV column as
                    # "name=value;" to INFO, matching rows on chrom/pos/ref/alt
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            (
                                SELECT
                                    concat(
                                        {",".join(sql_query_update_concat_fields)}
                                    )
                                FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                AND table_parquet.\"START\" = table_variants.\"POS\"
                                AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                AND table_parquet.\"REF\" = table_variants.\"REF\"
                            )
                        )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
(snpeff_jar and which(snpeff_jar))): 4761 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4762 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4763 4764 # Config - snpEff bin command 4765 snpeff_bin_command = get_bin_command( 4766 bin="snpEff.jar", 4767 tool="snpeff", 4768 bin_type="jar", 4769 config=config, 4770 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4771 ) 4772 if not snpeff_bin_command: 4773 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4774 log.error(msg_err) 4775 raise ValueError(msg_err) 4776 4777 # Config - snpEff databases 4778 snpeff_databases = ( 4779 config.get("folders", {}) 4780 .get("databases", {}) 4781 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4782 ) 4783 snpeff_databases = full_path(snpeff_databases) 4784 if snpeff_databases is not None and snpeff_databases != "": 4785 log.debug(f"Create snpEff databases folder") 4786 if not os.path.exists(snpeff_databases): 4787 os.makedirs(snpeff_databases) 4788 4789 # Param 4790 param = self.get_param() 4791 log.debug("Param: " + str(param)) 4792 4793 # Param 4794 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4795 log.debug("Options: " + str(options)) 4796 4797 # Param - Assembly 4798 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4799 4800 # Param - Options 4801 snpeff_options = ( 4802 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4803 ) 4804 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4805 snpeff_csvstats = ( 4806 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4807 ) 4808 if snpeff_stats: 4809 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4810 snpeff_stats = full_path(snpeff_stats) 4811 snpeff_options += f" -stats {snpeff_stats}" 4812 if snpeff_csvstats: 4813 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4814 snpeff_csvstats = full_path(snpeff_csvstats) 4815 
snpeff_options += f" -csvStats {snpeff_csvstats}" 4816 4817 # Data 4818 table_variants = self.get_table_variants() 4819 4820 # Check if not empty 4821 log.debug("Check if not empty") 4822 sql_query_chromosomes = ( 4823 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4824 ) 4825 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4826 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4827 log.info(f"VCF empty") 4828 return 4829 4830 # Export in VCF 4831 log.debug("Create initial file to annotate") 4832 tmp_vcf = NamedTemporaryFile( 4833 prefix=self.get_prefix(), 4834 dir=self.get_tmp_dir(), 4835 suffix=".vcf.gz", 4836 delete=True, 4837 ) 4838 tmp_vcf_name = tmp_vcf.name 4839 4840 # VCF header 4841 vcf_reader = self.get_header() 4842 log.debug("Initial header: " + str(vcf_reader.infos)) 4843 4844 # Existing annotations 4845 for vcf_annotation in self.get_header().infos: 4846 4847 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4848 log.debug( 4849 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4850 ) 4851 4852 # Memory limit 4853 # if config.get("memory", None): 4854 # memory_limit = config.get("memory", "8G") 4855 # else: 4856 # memory_limit = "8G" 4857 memory_limit = self.get_memory("8G") 4858 log.debug(f"memory_limit: {memory_limit}") 4859 4860 # snpEff java options 4861 snpeff_java_options = ( 4862 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4863 ) 4864 log.debug(f"Exomiser java options: {snpeff_java_options}") 4865 4866 force_update_annotation = True 4867 4868 if "ANN" not in self.get_header().infos or force_update_annotation: 4869 4870 # Check snpEff database 4871 log.debug(f"Check snpEff databases {[assembly]}") 4872 databases_download_snpeff( 4873 folder=snpeff_databases, assemblies=[assembly], config=config 4874 ) 4875 4876 # Export VCF file 4877 self.export_variant_vcf( 4878 vcf_file=tmp_vcf_name, 4879 remove_info=True, 
4880 add_samples=False, 4881 index=True, 4882 ) 4883 4884 # Tmp file 4885 err_files = [] 4886 tmp_annotate_vcf = NamedTemporaryFile( 4887 prefix=self.get_prefix(), 4888 dir=self.get_tmp_dir(), 4889 suffix=".vcf", 4890 delete=False, 4891 ) 4892 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4893 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4894 err_files.append(tmp_annotate_vcf_name_err) 4895 4896 # Command 4897 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 4898 log.debug(f"Annotation - snpEff command: {snpeff_command}") 4899 run_parallel_commands([snpeff_command], 1) 4900 4901 # Error messages 4902 log.info(f"Error/Warning messages:") 4903 error_message_command_all = [] 4904 error_message_command_warning = [] 4905 error_message_command_err = [] 4906 for err_file in err_files: 4907 with open(err_file, "r") as f: 4908 for line in f: 4909 message = line.strip() 4910 error_message_command_all.append(message) 4911 if line.startswith("[W::"): 4912 error_message_command_warning.append(message) 4913 if line.startswith("[E::"): 4914 error_message_command_err.append(f"{err_file}: " + message) 4915 # log info 4916 for message in list( 4917 set(error_message_command_err + error_message_command_warning) 4918 ): 4919 log.info(f" {message}") 4920 # debug info 4921 for message in list(set(error_message_command_all)): 4922 log.debug(f" {message}") 4923 # failed 4924 if len(error_message_command_err): 4925 log.error("Annotation failed: Error in commands") 4926 raise ValueError("Annotation failed: Error in commands") 4927 4928 # Find annotation in header 4929 with open(tmp_annotate_vcf_name, "rt") as f: 4930 header_list = self.read_vcf_header(f) 4931 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 4932 4933 for ann in annovar_vcf_header.infos: 4934 if ann not in self.get_header().infos: 4935 vcf_reader.infos[ann] = 
annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
                if force_update_annotation:
                    log.debug(f"Existing snpEff annotations in VCF - annotation forced")

    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with Annovar databases.

        Exports the variants to a temporary bgzipped VCF, runs `table_annovar.pl`
        once per configured Annovar database (piped through bcftools/sed/awk to
        clean up Annovar's INFO output), merges the per-database annotated VCFs
        with `bcftools merge`, adds the new INFO fields to the in-memory VCF
        header, and re-imports the annotations with `update_from_vcf`.

        :param threads: number of threads to use (defaults to `self.get_threads()`)
        :return: None
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files (collected for final cleanup / error reporting)
        tmp_files = []
        err_files = []

        # DEBUG - keep tmp files around when verbosity is debug
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (logged for debugging only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # NOTE(review): hard-coded — Annovar annotations always overwrite existing fields
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (download missing databases if needed)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: f=filter, g=gene, r=region (table_annovar.pl semantics)
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages - scan stderr capture files for warnings/errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and propagate new INFO fields
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - 
Updating...") 5315 self.update_from_vcf(tmp_annotate_vcf_name) 5316 5317 # Clean files 5318 # Tmp file remove command 5319 if True: 5320 tmp_files_remove_command = "" 5321 if tmp_files: 5322 tmp_files_remove_command = " ".join(tmp_files) 5323 clean_command = f" rm -f {tmp_files_remove_command} " 5324 log.debug(f"Annotation Annovar - Annotation cleaning ") 5325 log.debug(f"Annotation - cleaning command: {clean_command}") 5326 run_parallel_commands([clean_command], 1) 5327 5328 # Parquet 5329 def annotation_parquet(self, threads: int = None) -> None: 5330 """ 5331 It takes a VCF file, and annotates it with a parquet file 5332 5333 :param threads: number of threads to use for the annotation 5334 :return: the value of the variable "result". 5335 """ 5336 5337 # DEBUG 5338 log.debug("Start annotation with parquet databases") 5339 5340 # Threads 5341 if not threads: 5342 threads = self.get_threads() 5343 log.debug("Threads: " + str(threads)) 5344 5345 # DEBUG 5346 delete_tmp = True 5347 if self.get_config().get("verbosity", "warning") in ["debug"]: 5348 delete_tmp = False 5349 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5350 5351 # Config 5352 databases_folders = set( 5353 self.get_config() 5354 .get("folders", {}) 5355 .get("databases", {}) 5356 .get("annotations", ["."]) 5357 + self.get_config() 5358 .get("folders", {}) 5359 .get("databases", {}) 5360 .get("parquet", ["."]) 5361 ) 5362 log.debug("Databases annotations: " + str(databases_folders)) 5363 5364 # Param 5365 annotations = ( 5366 self.get_param() 5367 .get("annotation", {}) 5368 .get("parquet", {}) 5369 .get("annotations", None) 5370 ) 5371 log.debug("Annotations: " + str(annotations)) 5372 5373 # Assembly 5374 assembly = self.get_param().get( 5375 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5376 ) 5377 5378 # Force Update Annotation 5379 force_update_annotation = ( 5380 self.get_param() 5381 .get("annotation", {}) 5382 .get("options", {}) 5383 .get("annotations_update", 
False) 5384 ) 5385 log.debug(f"force_update_annotation={force_update_annotation}") 5386 force_append_annotation = ( 5387 self.get_param() 5388 .get("annotation", {}) 5389 .get("options", {}) 5390 .get("annotations_append", False) 5391 ) 5392 log.debug(f"force_append_annotation={force_append_annotation}") 5393 5394 # Data 5395 table_variants = self.get_table_variants() 5396 5397 # Check if not empty 5398 log.debug("Check if not empty") 5399 sql_query_chromosomes_df = self.get_query_to_df( 5400 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5401 ) 5402 if not sql_query_chromosomes_df["count"][0]: 5403 log.info(f"VCF empty") 5404 return 5405 5406 # VCF header 5407 vcf_reader = self.get_header() 5408 log.debug("Initial header: " + str(vcf_reader.infos)) 5409 5410 # Nb Variants POS 5411 log.debug("NB Variants Start") 5412 nb_variants = self.conn.execute( 5413 f"SELECT count(*) AS count FROM variants" 5414 ).fetchdf()["count"][0] 5415 log.debug("NB Variants Stop") 5416 5417 # Existing annotations 5418 for vcf_annotation in self.get_header().infos: 5419 5420 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5421 log.debug( 5422 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5423 ) 5424 5425 # Added columns 5426 added_columns = [] 5427 5428 # drop indexes 5429 log.debug(f"Drop indexes...") 5430 self.drop_indexes() 5431 5432 if annotations: 5433 5434 if "ALL" in annotations: 5435 5436 all_param = annotations.get("ALL", {}) 5437 all_param_formats = all_param.get("formats", None) 5438 all_param_releases = all_param.get("releases", None) 5439 5440 databases_infos_dict = self.scan_databases( 5441 database_formats=all_param_formats, 5442 database_releases=all_param_releases, 5443 ) 5444 for database_infos in databases_infos_dict.keys(): 5445 if database_infos not in annotations: 5446 annotations[database_infos] = {"INFO": None} 5447 5448 for annotation in annotations: 5449 5450 if annotation in ["ALL"]: 
5451 continue 5452 5453 # Annotation Name 5454 annotation_name = os.path.basename(annotation) 5455 5456 # Annotation fields 5457 annotation_fields = annotations[annotation] 5458 if not annotation_fields: 5459 annotation_fields = {"INFO": None} 5460 5461 log.debug(f"Annotation '{annotation_name}'") 5462 log.debug( 5463 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5464 ) 5465 5466 # Create Database 5467 database = Database( 5468 database=annotation, 5469 databases_folders=databases_folders, 5470 assembly=assembly, 5471 ) 5472 5473 # Find files 5474 parquet_file = database.get_database() 5475 parquet_hdr_file = database.get_header_file() 5476 parquet_type = database.get_type() 5477 5478 # Check if files exists 5479 if not parquet_file or not parquet_hdr_file: 5480 log.error("Annotation failed: file not found") 5481 raise ValueError("Annotation failed: file not found") 5482 else: 5483 # Get parquet connexion 5484 parquet_sql_attach = database.get_sql_database_attach( 5485 output="query" 5486 ) 5487 if parquet_sql_attach: 5488 self.conn.execute(parquet_sql_attach) 5489 parquet_file_link = database.get_sql_database_link() 5490 # Log 5491 log.debug( 5492 f"Annotation '{annotation_name}' - file: " 5493 + str(parquet_file) 5494 + " and " 5495 + str(parquet_hdr_file) 5496 ) 5497 5498 # Database full header columns 5499 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5500 parquet_hdr_file 5501 ) 5502 # Log 5503 log.debug( 5504 "Annotation database header columns : " 5505 + str(parquet_hdr_vcf_header_columns) 5506 ) 5507 5508 # Load header as VCF object 5509 parquet_hdr_vcf_header_infos = database.get_header().infos 5510 # Log 5511 log.debug( 5512 "Annotation database header: " 5513 + str(parquet_hdr_vcf_header_infos) 5514 ) 5515 5516 # Get extra infos 5517 parquet_columns = database.get_extra_columns() 5518 # Log 5519 log.debug("Annotation database Columns: " + str(parquet_columns)) 5520 5521 # Add extra columns if "ALL" in 
annotation_fields 5522 # if "ALL" in annotation_fields: 5523 # allow_add_extra_column = True 5524 if "ALL" in annotation_fields and database.get_extra_columns(): 5525 for extra_column in database.get_extra_columns(): 5526 if ( 5527 extra_column not in annotation_fields 5528 and extra_column.replace("INFO/", "") 5529 not in parquet_hdr_vcf_header_infos 5530 ): 5531 parquet_hdr_vcf_header_infos[extra_column] = ( 5532 vcf.parser._Info( 5533 extra_column, 5534 ".", 5535 "String", 5536 f"{extra_column} description", 5537 "unknown", 5538 "unknown", 5539 self.code_type_map["String"], 5540 ) 5541 ) 5542 5543 # For all fields in database 5544 annotation_fields_all = False 5545 if "ALL" in annotation_fields or "INFO" in annotation_fields: 5546 annotation_fields_all = True 5547 annotation_fields = { 5548 key: key for key in parquet_hdr_vcf_header_infos 5549 } 5550 5551 log.debug( 5552 "Annotation database header - All annotations added: " 5553 + str(annotation_fields) 5554 ) 5555 5556 # Init 5557 5558 # List of annotation fields to use 5559 sql_query_annotation_update_info_sets = [] 5560 5561 # List of annotation to agregate 5562 sql_query_annotation_to_agregate = [] 5563 5564 # Number of fields 5565 nb_annotation_field = 0 5566 5567 # Annotation fields processed 5568 annotation_fields_processed = [] 5569 5570 # Columns mapping 5571 map_columns = database.map_columns( 5572 columns=annotation_fields, prefixes=["INFO/"] 5573 ) 5574 5575 # Query dict for fields to remove (update option) 5576 query_dict_remove = {} 5577 5578 # Fetch Anotation fields 5579 for annotation_field in annotation_fields: 5580 5581 # annotation_field_column 5582 annotation_field_column = map_columns.get( 5583 annotation_field, "INFO" 5584 ) 5585 5586 # field new name, if parametered 5587 annotation_fields_new_name = annotation_fields.get( 5588 annotation_field, annotation_field 5589 ) 5590 if not annotation_fields_new_name: 5591 annotation_fields_new_name = annotation_field 5592 5593 # To annotate 5594 # 
force_update_annotation = True 5595 # force_append_annotation = True 5596 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 5597 if annotation_field in parquet_hdr_vcf_header_infos and ( 5598 force_update_annotation 5599 or force_append_annotation 5600 or ( 5601 annotation_fields_new_name 5602 not in self.get_header().infos 5603 ) 5604 ): 5605 5606 # Add field to annotation to process list 5607 annotation_fields_processed.append( 5608 annotation_fields_new_name 5609 ) 5610 5611 # explode infos for the field 5612 annotation_fields_new_name_info_msg = "" 5613 if ( 5614 force_update_annotation 5615 and annotation_fields_new_name 5616 in self.get_header().infos 5617 ): 5618 # Remove field from INFO 5619 query = f""" 5620 UPDATE {table_variants} as table_variants 5621 SET INFO = REGEXP_REPLACE( 5622 concat(table_variants.INFO,''), 5623 ';*{annotation_fields_new_name}=[^;]*', 5624 '' 5625 ) 5626 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 5627 """ 5628 annotation_fields_new_name_info_msg = " [update]" 5629 query_dict_remove[ 5630 f"remove 'INFO/{annotation_fields_new_name}'" 5631 ] = query 5632 5633 # Sep between fields in INFO 5634 nb_annotation_field += 1 5635 if nb_annotation_field > 1: 5636 annotation_field_sep = ";" 5637 else: 5638 annotation_field_sep = "" 5639 5640 log.info( 5641 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 5642 ) 5643 5644 # Add INFO field to header 5645 parquet_hdr_vcf_header_infos_number = ( 5646 parquet_hdr_vcf_header_infos[annotation_field].num 5647 or "." 
5648 ) 5649 parquet_hdr_vcf_header_infos_type = ( 5650 parquet_hdr_vcf_header_infos[annotation_field].type 5651 or "String" 5652 ) 5653 parquet_hdr_vcf_header_infos_description = ( 5654 parquet_hdr_vcf_header_infos[annotation_field].desc 5655 or f"{annotation_field} description" 5656 ) 5657 parquet_hdr_vcf_header_infos_source = ( 5658 parquet_hdr_vcf_header_infos[annotation_field].source 5659 or "unknown" 5660 ) 5661 parquet_hdr_vcf_header_infos_version = ( 5662 parquet_hdr_vcf_header_infos[annotation_field].version 5663 or "unknown" 5664 ) 5665 5666 vcf_reader.infos[annotation_fields_new_name] = ( 5667 vcf.parser._Info( 5668 annotation_fields_new_name, 5669 parquet_hdr_vcf_header_infos_number, 5670 parquet_hdr_vcf_header_infos_type, 5671 parquet_hdr_vcf_header_infos_description, 5672 parquet_hdr_vcf_header_infos_source, 5673 parquet_hdr_vcf_header_infos_version, 5674 self.code_type_map[ 5675 parquet_hdr_vcf_header_infos_type 5676 ], 5677 ) 5678 ) 5679 5680 # Append 5681 if force_append_annotation: 5682 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 5683 else: 5684 query_case_when_append = "" 5685 5686 # Annotation/Update query fields 5687 # Found in INFO column 5688 if ( 5689 annotation_field_column == "INFO" 5690 and "INFO" in parquet_hdr_vcf_header_columns 5691 ): 5692 sql_query_annotation_update_info_sets.append( 5693 f""" 5694 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 5695 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 5696 ELSE '' 5697 END 5698 """ 5699 ) 5700 # Found in a specific column 5701 else: 5702 sql_query_annotation_update_info_sets.append( 5703 f""" 5704 CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append} 5705 THEN 
concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ',')) 5706 ELSE '' 5707 END 5708 """ 5709 ) 5710 sql_query_annotation_to_agregate.append( 5711 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 5712 ) 5713 5714 # Not to annotate 5715 else: 5716 5717 if force_update_annotation: 5718 annotation_message = "forced" 5719 else: 5720 annotation_message = "skipped" 5721 5722 if annotation_field not in parquet_hdr_vcf_header_infos: 5723 log.warning( 5724 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 5725 ) 5726 if annotation_fields_new_name in self.get_header().infos: 5727 log.warning( 5728 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 5729 ) 5730 5731 # Check if ALL fields have to be annotated. Thus concat all INFO field 5732 # allow_annotation_full_info = True 5733 allow_annotation_full_info = not force_append_annotation 5734 5735 if parquet_type in ["regions"]: 5736 allow_annotation_full_info = False 5737 5738 if ( 5739 allow_annotation_full_info 5740 and nb_annotation_field == len(annotation_fields) 5741 and annotation_fields_all 5742 and ( 5743 "INFO" in parquet_hdr_vcf_header_columns 5744 and "INFO" in database.get_extra_columns() 5745 ) 5746 ): 5747 log.debug("Column INFO annotation enabled") 5748 sql_query_annotation_update_info_sets = [] 5749 sql_query_annotation_update_info_sets.append( 5750 f" table_parquet.INFO " 5751 ) 5752 5753 if sql_query_annotation_update_info_sets: 5754 5755 # Annotate 5756 log.info(f"Annotation '{annotation_name}' - Annotation...") 5757 5758 # Join query annotation update info sets for SQL 5759 sql_query_annotation_update_info_sets_sql = ",".join( 5760 sql_query_annotation_update_info_sets 5761 ) 5762 5763 # Check chromosomes list (and variants 
infos) 5764 sql_query_chromosomes = f""" 5765 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 5766 FROM {table_variants} as table_variants 5767 GROUP BY table_variants."#CHROM" 5768 ORDER BY table_variants."#CHROM" 5769 """ 5770 sql_query_chromosomes_df = self.conn.execute( 5771 sql_query_chromosomes 5772 ).df() 5773 sql_query_chromosomes_dict = { 5774 entry["CHROM"]: { 5775 "count": entry["count_variants"], 5776 "min": entry["min_variants"], 5777 "max": entry["max_variants"], 5778 } 5779 for index, entry in sql_query_chromosomes_df.iterrows() 5780 } 5781 5782 # Init 5783 nb_of_query = 0 5784 nb_of_variant_annotated = 0 5785 query_dict = query_dict_remove 5786 5787 # for chrom in sql_query_chromosomes_df["CHROM"]: 5788 for chrom in sql_query_chromosomes_dict: 5789 5790 # Number of variant by chromosome 5791 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 5792 chrom, {} 5793 ).get("count", 0) 5794 5795 log.debug( 5796 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 
5797 ) 5798 5799 # Annotation with regions database 5800 if parquet_type in ["regions"]: 5801 sql_query_annotation_from_clause = f""" 5802 FROM ( 5803 SELECT 5804 '{chrom}' AS \"#CHROM\", 5805 table_variants_from.\"POS\" AS \"POS\", 5806 {",".join(sql_query_annotation_to_agregate)} 5807 FROM {table_variants} as table_variants_from 5808 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 5809 table_parquet_from."#CHROM" = '{chrom}' 5810 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 5811 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 5812 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 5813 ) 5814 ) 5815 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 5816 GROUP BY table_variants_from.\"POS\" 5817 ) 5818 as table_parquet 5819 """ 5820 5821 sql_query_annotation_where_clause = """ 5822 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5823 AND table_parquet.\"POS\" = table_variants.\"POS\" 5824 """ 5825 5826 # Annotation with variants database 5827 else: 5828 sql_query_annotation_from_clause = f""" 5829 FROM {parquet_file_link} as table_parquet 5830 """ 5831 sql_query_annotation_where_clause = f""" 5832 table_variants."#CHROM" = '{chrom}' 5833 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5834 AND table_parquet.\"POS\" = table_variants.\"POS\" 5835 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5836 AND table_parquet.\"REF\" = table_variants.\"REF\" 5837 """ 5838 5839 # Create update query 5840 sql_query_annotation_chrom_interval_pos = f""" 5841 UPDATE {table_variants} as table_variants 5842 SET INFO = 5843 concat( 5844 CASE WHEN table_variants.INFO NOT IN ('','.') 5845 THEN table_variants.INFO 5846 ELSE '' 5847 END 5848 , 5849 CASE WHEN table_variants.INFO NOT IN ('','.') 5850 AND ( 5851 concat({sql_query_annotation_update_info_sets_sql}) 5852 ) 5853 NOT IN ('','.') 5854 THEN ';' 5855 ELSE '' 5856 END 5857 , 5858 
{sql_query_annotation_update_info_sets_sql} 5859 ) 5860 {sql_query_annotation_from_clause} 5861 WHERE {sql_query_annotation_where_clause} 5862 ; 5863 """ 5864 5865 # Add update query to dict 5866 query_dict[ 5867 f"{chrom} [{nb_of_variant_by_chrom} variants]" 5868 ] = sql_query_annotation_chrom_interval_pos 5869 5870 nb_of_query = len(query_dict) 5871 num_query = 0 5872 5873 # SET max_expression_depth TO x 5874 self.conn.execute("SET max_expression_depth TO 10000") 5875 5876 for query_name in query_dict: 5877 query = query_dict[query_name] 5878 num_query += 1 5879 log.info( 5880 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 5881 ) 5882 result = self.conn.execute(query) 5883 nb_of_variant_annotated_by_query = result.df()["Count"][0] 5884 nb_of_variant_annotated += nb_of_variant_annotated_by_query 5885 log.info( 5886 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 5887 ) 5888 5889 log.info( 5890 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 5891 ) 5892 5893 else: 5894 5895 log.info( 5896 f"Annotation '{annotation_name}' - No Annotations available" 5897 ) 5898 5899 log.debug("Final header: " + str(vcf_reader.infos)) 5900 5901 # Remove added columns 5902 for added_column in added_columns: 5903 self.drop_column(column=added_column) 5904 5905 def annotation_splice(self, threads: int = None) -> None: 5906 """ 5907 This function annotate with snpEff 5908 5909 :param threads: The number of threads to use 5910 :return: the value of the variable "return_value". 
5911 """ 5912 5913 # DEBUG 5914 log.debug("Start annotation with splice tools") 5915 5916 # Threads 5917 if not threads: 5918 threads = self.get_threads() 5919 log.debug("Threads: " + str(threads)) 5920 5921 # DEBUG 5922 delete_tmp = True 5923 if self.get_config().get("verbosity", "warning") in ["debug"]: 5924 delete_tmp = False 5925 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5926 5927 # Config 5928 config = self.get_config() 5929 log.debug("Config: " + str(config)) 5930 splice_config = config.get("tools", {}).get("splice", {}) 5931 if not splice_config: 5932 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5933 if not splice_config: 5934 msg_err = "No Splice tool config" 5935 log.error(msg_err) 5936 raise ValueError(msg_err) 5937 log.debug(f"splice_config={splice_config}") 5938 5939 # Config - Folders - Databases 5940 databases_folders = ( 5941 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5942 ) 5943 log.debug("Databases annotations: " + str(databases_folders)) 5944 5945 # Splice docker image 5946 splice_docker_image = splice_config.get("docker").get("image") 5947 5948 # Pull splice image if it's not already there 5949 if not check_docker_image_exists(splice_docker_image): 5950 log.warning( 5951 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5952 ) 5953 try: 5954 command(f"docker pull {splice_config.get('docker').get('image')}") 5955 except subprocess.CalledProcessError: 5956 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5957 log.error(msg_err) 5958 raise ValueError(msg_err) 5959 return None 5960 5961 # Config - splice databases 5962 splice_databases = ( 5963 config.get("folders", {}) 5964 .get("databases", {}) 5965 .get("splice", DEFAULT_SPLICE_FOLDER) 5966 ) 5967 splice_databases = full_path(splice_databases) 5968 5969 # Param 5970 param = self.get_param() 5971 log.debug("Param: " + str(param)) 5972 5973 # Param 5974 options = 
param.get("annotation", {}).get("splice", {}) 5975 log.debug("Options: " + str(options)) 5976 5977 # Data 5978 table_variants = self.get_table_variants() 5979 5980 # Check if not empty 5981 log.debug("Check if not empty") 5982 sql_query_chromosomes = ( 5983 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5984 ) 5985 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5986 log.info("VCF empty") 5987 return None 5988 5989 # Export in VCF 5990 log.debug("Create initial file to annotate") 5991 5992 # Create output folder 5993 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 5994 if not os.path.exists(output_folder): 5995 Path(output_folder).mkdir(parents=True, exist_ok=True) 5996 5997 # Create tmp VCF file 5998 tmp_vcf = NamedTemporaryFile( 5999 prefix=self.get_prefix(), 6000 dir=output_folder, 6001 suffix=".vcf", 6002 delete=False, 6003 ) 6004 tmp_vcf_name = tmp_vcf.name 6005 6006 # VCF header 6007 header = self.get_header() 6008 6009 # Existing annotations 6010 for vcf_annotation in self.get_header().infos: 6011 6012 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6013 log.debug( 6014 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6015 ) 6016 6017 # Memory limit 6018 if config.get("memory", None): 6019 memory_limit = config.get("memory", "8G").upper() 6020 # upper() 6021 else: 6022 memory_limit = "8G" 6023 log.debug(f"memory_limit: {memory_limit}") 6024 6025 # Check number of variants to annotate 6026 where_clause_regex_spliceai = r"SpliceAI_\w+" 6027 where_clause_regex_spip = r"SPiP_\w+" 6028 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6029 df_list_of_variants_to_annotate = self.get_query_to_df( 6030 query=f""" SELECT * FROM variants {where_clause} """ 6031 ) 6032 if len(df_list_of_variants_to_annotate) == 0: 6033 log.warning( 6034 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6035 ) 6036 return None 6037 else: 6038 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6039 6040 # Export VCF file 6041 self.export_variant_vcf( 6042 vcf_file=tmp_vcf_name, 6043 remove_info=True, 6044 add_samples=True, 6045 index=False, 6046 where_clause=where_clause, 6047 ) 6048 6049 # Create docker container and launch splice analysis 6050 if splice_config: 6051 6052 # Splice mount folders 6053 mount_folders = splice_config.get("mount", {}) 6054 6055 # Genome mount 6056 mount_folders[ 6057 config.get("folders", {}) 6058 .get("databases", {}) 6059 .get("genomes", DEFAULT_GENOME_FOLDER) 6060 ] = "ro" 6061 6062 # SpliceAI mount 6063 mount_folders[ 6064 config.get("folders", {}) 6065 .get("databases", {}) 6066 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6067 ] = "ro" 6068 6069 # Genome mount 6070 mount_folders[ 6071 config.get("folders", {}) 6072 .get("databases", {}) 6073 .get("spip", DEFAULT_SPIP_FOLDER) 6074 ] = "ro" 6075 6076 # Mount folders 6077 mount = [] 6078 6079 # Config mount 6080 mount = [ 6081 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6082 for path, mode in mount_folders.items() 6083 ] 6084 6085 if any(value for value in splice_config.values() if value is None): 6086 log.warning("At least one splice config parameter is empty") 6087 return None 6088 6089 # Params in splice nf 6090 def check_values(dico: dict): 6091 """ 6092 Ensure parameters for NF splice pipeline 6093 """ 6094 for key, val in dico.items(): 6095 if key == "genome": 6096 if any( 6097 assemb in options.get("genome", {}) 6098 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6099 ): 6100 yield f"--{key} hg19" 6101 elif any( 6102 assemb in options.get("genome", {}) 6103 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6104 ): 6105 yield f"--{key} hg38" 6106 elif ( 6107 (isinstance(val, str) and val) 6108 or isinstance(val, int) 6109 or isinstance(val, bool) 6110 ): 6111 yield f"--{key} 
{val}" 6112 6113 # Genome 6114 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6115 options["genome"] = genome 6116 6117 # NF params 6118 nf_params = [] 6119 6120 # Add options 6121 if options: 6122 nf_params = list(check_values(options)) 6123 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6124 else: 6125 log.debug("No NF params provided") 6126 6127 # Add threads 6128 if "threads" not in options.keys(): 6129 nf_params.append(f"--threads {threads}") 6130 6131 # Genome path 6132 genome_path = find_genome( 6133 config.get("folders", {}) 6134 .get("databases", {}) 6135 .get("genomes", DEFAULT_GENOME_FOLDER), 6136 file=f"{genome}.fa", 6137 ) 6138 # Add genome path 6139 if not genome_path: 6140 raise ValueError( 6141 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6142 ) 6143 else: 6144 log.debug(f"Genome: {genome_path}") 6145 nf_params.append(f"--genome_path {genome_path}") 6146 6147 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6148 """ 6149 Setting up updated databases for SPiP and SpliceAI 6150 """ 6151 6152 try: 6153 6154 # SpliceAI assembly transcriptome 6155 spliceai_assembly = os.path.join( 6156 config.get("folders", {}) 6157 .get("databases", {}) 6158 .get("spliceai", {}), 6159 options.get("genome"), 6160 "transcriptome", 6161 ) 6162 spip_assembly = options.get("genome") 6163 6164 spip = find( 6165 f"transcriptome_{spip_assembly}.RData", 6166 config.get("folders", {}).get("databases", {}).get("spip", {}), 6167 ) 6168 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6169 log.debug(f"SPiP annotations: {spip}") 6170 log.debug(f"SpliceAI annotations: {spliceai}") 6171 if spip and spliceai: 6172 return [ 6173 f"--spip_transcriptome {spip}", 6174 f"--spliceai_annotations {spliceai}", 6175 ] 6176 else: 6177 # TODO crash and go on with basic annotations ? 
6178 # raise ValueError( 6179 # "Can't find splice databases in configuration EXIT" 6180 # ) 6181 log.warning( 6182 "Can't find splice databases in configuration, use annotations file from image" 6183 ) 6184 except TypeError: 6185 log.warning( 6186 "Can't find splice databases in configuration, use annotations file from image" 6187 ) 6188 return [] 6189 6190 # Add options, check if transcriptome option have already beend provided 6191 if ( 6192 "spip_transcriptome" not in nf_params 6193 and "spliceai_transcriptome" not in nf_params 6194 ): 6195 splice_reference = splice_annotations(options, config) 6196 if splice_reference: 6197 nf_params.extend(splice_reference) 6198 6199 nf_params.append(f"--output_folder {output_folder}") 6200 6201 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6202 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6203 log.debug(cmd) 6204 6205 splice_config["docker"]["command"] = cmd 6206 6207 docker_cmd = get_bin_command( 6208 tool="splice", 6209 bin_type="docker", 6210 config=config, 6211 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6212 add_options=f"--name {random_uuid} {' '.join(mount)}", 6213 ) 6214 6215 # Docker debug 6216 # if splice_config.get("rm_container"): 6217 # rm_container = "--rm" 6218 # else: 6219 # rm_container = "" 6220 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6221 6222 log.debug(docker_cmd) 6223 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6224 log.debug(res.stdout) 6225 if res.stderr: 6226 log.error(res.stderr) 6227 res.check_returncode() 6228 else: 6229 log.warning(f"Splice tool configuration not found: {config}") 6230 
        # Update variants
        log.info("Annotation - Updating...")
        # Test find output vcf
        log.debug(
            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
        )
        output_vcf = []
        # Wrong folder to look in
        # Scan the temp VCF's folder for the expected splice pipeline output name
        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
            if (
                files
                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
            ):
                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
        # log.debug(os.listdir(options.get("output_folder")))
        # NOTE(review): this debug line dereferences output_vcf[0] BEFORE the
        # emptiness guard below — raises IndexError when no output was found.
        # Confirm whether it should be moved inside the `else` branch.
        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
        if not output_vcf:
            log.debug(
                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
            )
        else:
            # Get new header from annotated vcf
            log.debug(f"Initial header: {len(header.infos)} fields")
            # Create new header with splice infos: merge any INFO fields from the
            # splice-annotated VCF that are not already declared in the header
            new_vcf = Variants(input=output_vcf[0])
            new_vcf_header = new_vcf.get_header().infos
            for keys, infos in new_vcf_header.items():
                if keys not in header.infos.keys():
                    header.infos[keys] = infos
            log.debug(f"New header: {len(header.infos)} fields")
            log.debug(f"Splice tmp output: {output_vcf[0]}")
            # Fold splice annotations back into the variants table
            self.update_from_vcf(output_vcf[0])

        # Remove temporary splice working folder
        remove_if_exists(output_folder)

    ###
    # Prioritization
    ###

    def get_config_default(self, name: str) -> dict:
        """
        Return the built-in default configuration for a given configuration
        section.

        Two sections are available:

        - "calculations": default operations (SQL- or Python-based) used to
          compute new INFO fields (e.g. VARTYPE, NOMEN, BARCODE, VAF stats).
        - "prioritizations": default prioritization profiles with their
          filtering/scoring criteria.

        :param name: Name of the configuration section to retrieve
            ("calculations" or "prioritizations")
        :type name: str
        :return: The default configuration dictionary for `name`, or ``None``
            when `name` does not match a known section
        """

        config_default = {
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    # SVTYPE (structural variants) takes precedence over the
                    # length-based classification below
                    "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                    """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the "description" strings of the next two entries
                # look swapped ("with uniquify values" sits on the non-uniquify
                # entry) — confirm against the calculation implementation.
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
            },
            "prioritizations": {
                # Default profile: each key is an annotation field; each entry
                # is a criterion with a comparison type/value, a score delta,
                # a PASS/FILTERED flag, and a human-readable comment.
                "default": {
                    "filter": [
                        {
                            "type": "notequals",
                            "value": "!PASS|\\.",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": ["Bad variant quality"],
                        },
                        {
                            "type": "equals",
                            "value": "REJECT",
                            "score": -20,
                            "flag": "PASS",
                            "comment": ["Bad variant quality"],
                        },
                    ],
                    "DP": [
                        {
                            "type": "gte",
                            "value": "50",
                            "score": 5,
                            "flag": "PASS",
                            "comment": ["DP higher than 50"],
                        }
                    ],
                    "ANN": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        return config_default.get(name, None)

    # NOTE(review): `config_dict` uses a mutable default argument; it is only
    # read (never mutated) here, but `config_dict=None` + in-body default
    # would be the safer idiom.
    def get_config_json(
        self, name: str, config_dict: dict = {}, config_file: str = None
    ) -> dict:
        """
        Build a configuration dictionary for `name` by merging three sources in
        increasing priority: built-in defaults (see `get_config_default`), the
        `config_dict` argument, then the JSON `config_file` content.

        :param name: Name of the configuration section (e.g. "prioritizations")
        :type name: str
        :param config_dict: Optional dictionary whose top-level keys override
            the default configuration
        :type config_dict: dict
        :param config_file: Optional path to a JSON file whose top-level keys
            override both defaults and `config_dict`
        :type config_file: str
        :raises ValueError: If `config_file` is provided but does not exist
        :return: The merged configuration dictionary
        """

        # Create with default prioritizations
        # NOTE(review): if `name` is unknown, get_config_default returns None
        # and the item assignments below would raise TypeError — confirm
        # callers only pass known section names.
        config_default = self.get_config_default(name=name)
        configuration = config_default
        # log.debug(f"configuration={configuration}")

        # Replace prioritizations from dict (top-level keys override defaults)
        for config in config_dict:
            configuration[config] = config_dict[config]

        # Replace prioritizations from file (top-level keys override everything)
        config_file = full_path(config_file)
        if config_file:
            if os.path.exists(config_file):
                with open(config_file) as config_file_content:
                    config_file_dict = json.load(config_file_content)
                for config in config_file_dict:
                    configuration[config] = config_file_dict[config]
            else:
                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
                log.error(msg_error)
                raise ValueError(msg_error)

        return configuration

    def prioritization(self) -> None:
        """
        Prioritize variants according to configured profiles.

        For each selected profile, evaluates the profile's criteria against
        exploded INFO fields and accumulates the results into new INFO fields
        (PZScore, PZFlag, PZComment, PZInfos, PZTags — per-profile suffixed
        variants, plus unsuffixed ones for the default profile), then writes
        them back into the variants table's INFO column via SQL UPDATEs.

        Reads its settings from `self.get_param()["prioritization"]` (profiles,
        pzfields, default_profile, pzfields_sep, prioritization_score_mode,
        prioritization_config) and from the quick "prioritizations" parameter.

        :raises ValueError: If a requested profile is not present in the
            prioritization configuration
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Quick Prioritizations
        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")

        # Configuration profiles: defaults merged with an optional JSON file
        prioritization_config_file = param.get("prioritization", {}).get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization options (comma-separated strings are accepted)
        profiles = param.get("prioritization", {}).get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = param.get("prioritization", {}).get(
            "pzfields", ["PZFlag", "PZScore"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = param.get("prioritization", {}).get("default_profile", None)
        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
        # "HOWARD" accumulates criterion scores; "VaRank" keeps the max score
        prioritization_score_mode = param.get("prioritization", {}).get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations
        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                    log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return

        # Fall back to the first requested profile as the default
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Added columns (temporary; dropped at the end)
        added_columns = []

        # Create list of PZfields
        # List of PZFields: base names plus one per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Skip PZfields already declared in the VCF header
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos fields into table columns for SQL criteria
            explode_infos_prefix = self.get_explode_infos_prefix()
            added_columns += self.explode_infos(prefix=explode_infos_prefix)
            extra_infos = self.get_extra_infos()

            # PZfields tags description (VCF header metadata for each PZ field)
            PZfields_INFOS = {
                "PZTags": {
                    "ID": "PZTags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                "PZScore": {
                    "ID": "PZScore",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                "PZFlag": {
                    "ID": "PZFlag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                "PZComment": {
                    "ID": "PZComment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                "PZInfos": {
                    "ID": "PZInfos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
            }

            # Create INFO header fields if not exist (unsuffixed, default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO header fields if not exist, suffixed per profile
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header: add one working column per PZfield, typed by field kind
            for pzfield in list_of_pzfields:
                if re.match("PZScore.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match("PZFlag.*", pzfield):
                    # Flags start as TRUE (PASS) and are AND-ed with criteria
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # PZ fields set: build the INFO=... fragments that will
                        # serialize each working column back into INFO

                        # PZScore
                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZScore{pzfields_sep}{profile}=',
                                    PZScore{pzfields_sep}{profile}
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZScore" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZScore=',
                                    PZScore{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag (boolean column rendered as PASS/FILTERED)
                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZFlag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN PZFlag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN PZFlag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZFlag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZFlag=',
                                    CASE
                                        WHEN PZFlag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN PZFlag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZComment (only emitted when non-empty)
                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZComment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos (only emitted when non-empty)
                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZInfos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields: join all fragments, ';'-separated
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # Check if annotation field is present
                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
                                log.debug(f"Annotation '{annotation}' not in data")
                                continue
                            else:
                                log.debug(f"Annotation '{annotation}' in data")

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:
                                criterion_type = criterion["type"]
                                criterion_value = criterion["value"]
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Escape quotes/separators so the text can be
                                # embedded in the SQL literal and INFO field
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                sql_set = []
                                sql_set_info = []

                                # PZ fields set
                                if (
                                    f"PZScore{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    if prioritization_score_mode == "HOWARD":
                                        # HOWARD mode: scores accumulate
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                    elif prioritization_score_mode == "VaRank":
                                        # VaRank mode: keep the maximum score
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        # Unknown mode: fall back to accumulation
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
                                    # A single FILTERED criterion flips the flag
                                    sql_set.append(
                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )
                                if (
                                    f"PZComment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        PZComment{pzfields_sep}{profile} =
                                        concat(
                                            PZComment{pzfields_sep}{profile},
                                            CASE
                                                WHEN PZComment{pzfields_sep}{profile}!=''
                                                THEN ', '
                                                ELSE ''
                                            END,
                                            '{criterion_comment}'
                                        )
                                        """
                                    )
                                if (
                                    f"PZInfos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        PZInfos{pzfields_sep}{profile} =
                                        concat(
                                            PZInfos{pzfields_sep}{profile},
                                            '{criterion_infos}'
                                        )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison: numeric values use the
                                # comparison_map operator, non-numeric values use
                                # SIMILAR TO regex matching.
                                # NOTE(review): bare `except:` — intentionally used
                                # here as "not parseable as float", but it would
                                # also swallow unrelated errors.
                                try:
                                    float(criterion_value)
                                    sql_update = f"""
                                        UPDATE {table_variants}
                                        SET {sql_set_option}
                                        WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                        AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
                                    """
                                except:
                                    contains_option = ""
                                    if criterion_type == "contains":
                                        contains_option = ".*"
                                    sql_update = f"""
                                        UPDATE {table_variants}
                                        SET {sql_set_option}
                                        WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                    """
                                sql_queries.append(sql_update)

                        # PZTags
                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:

                            # Create PZTags value: '|'-separated field#value pairs
                            pztags_value = ""
                            pztags_sep_default = "|"
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in ["PZTags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in ["PZFlag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN PZFlag{pzfields_sep}{profile}
                                                THEN 'PASS'
                                                ELSE 'FILTERED'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZTags (suffixed per profile)
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    'PZTags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZTags for default profile
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                    UPDATE {table_variants}
                                    SET INFO = concat(
                                        INFO,
                                        ';',
                                        'PZTags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                            # Serialize the working columns back into INFO
                            log.info(f"""Profile '{profile}' - Update... """)
                            sql_query_update = f"""
                                UPDATE {table_variants}
                                SET INFO =
                                    concat(
                                        CASE
                                            WHEN INFO NOT IN ('','.')
                                            THEN concat(INFO, ';')
                                            ELSE ''
                                        END
                                        {sql_set_info_option}
                                    )
                            """
                            self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added (temporary working) columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return

    ###
    # HGVS
    ###

    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
7129 """ 7130 return partition.apply(annotation_hgvs_partition, axis=1) 7131 7132 def annotation_hgvs_partition(row) -> str: 7133 """ 7134 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7135 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7136 7137 :param row: A dictionary-like object that contains the values for the following keys: 7138 :return: a string that contains the HGVS names associated with the given row of data. 7139 """ 7140 7141 chr = row["CHROM"] 7142 pos = row["POS"] 7143 ref = row["REF"] 7144 alt = row["ALT"] 7145 7146 # Find list of associated transcripts 7147 transcripts_list = list( 7148 polars_conn.execute( 7149 f""" 7150 SELECT transcript 7151 FROM refseq_df 7152 WHERE CHROM='{chr}' 7153 AND POS={pos} 7154 """ 7155 )["transcript"] 7156 ) 7157 7158 # Full HGVS annotation in list 7159 hgvs_full_list = [] 7160 7161 for transcript_name in transcripts_list: 7162 7163 # Transcript 7164 transcript = get_transcript( 7165 transcripts=transcripts, transcript_name=transcript_name 7166 ) 7167 # Exon 7168 if use_exon: 7169 exon = transcript.find_exon_number(pos) 7170 else: 7171 exon = None 7172 # Protein 7173 transcript_protein = None 7174 if use_protein or add_protein or full_format: 7175 transcripts_protein = list( 7176 polars_conn.execute( 7177 f""" 7178 SELECT protein 7179 FROM refseqlink_df 7180 WHERE transcript='{transcript_name}' 7181 LIMIT 1 7182 """ 7183 )["protein"] 7184 ) 7185 if len(transcripts_protein): 7186 transcript_protein = transcripts_protein[0] 7187 7188 # HGVS name 7189 hgvs_name = format_hgvs_name( 7190 chr, 7191 pos, 7192 ref, 7193 alt, 7194 genome=genome, 7195 transcript=transcript, 7196 transcript_protein=transcript_protein, 7197 exon=exon, 7198 use_gene=use_gene, 7199 use_protein=use_protein, 7200 full_format=full_format, 7201 use_version=use_version, 7202 codon_type=codon_type, 7203 ) 7204 hgvs_full_list.append(hgvs_name) 7205 if add_protein and not 
use_protein and not full_format: 7206 hgvs_name = format_hgvs_name( 7207 chr, 7208 pos, 7209 ref, 7210 alt, 7211 genome=genome, 7212 transcript=transcript, 7213 transcript_protein=transcript_protein, 7214 exon=exon, 7215 use_gene=use_gene, 7216 use_protein=True, 7217 full_format=False, 7218 use_version=use_version, 7219 codon_type=codon_type, 7220 ) 7221 hgvs_full_list.append(hgvs_name) 7222 7223 # Create liste of HGVS annotations 7224 hgvs_full = ",".join(hgvs_full_list) 7225 7226 return hgvs_full 7227 7228 # Polars connexion 7229 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7230 7231 # Config 7232 config = self.get_config() 7233 7234 # Databases 7235 # Genome 7236 databases_genomes_folders = ( 7237 config.get("folders", {}) 7238 .get("databases", {}) 7239 .get("genomes", DEFAULT_GENOME_FOLDER) 7240 ) 7241 databases_genome = ( 7242 config.get("folders", {}).get("databases", {}).get("genomes", "") 7243 ) 7244 # refseq database folder 7245 databases_refseq_folders = ( 7246 config.get("folders", {}) 7247 .get("databases", {}) 7248 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7249 ) 7250 # refseq 7251 databases_refseq = config.get("databases", {}).get("refSeq", None) 7252 # refSeqLink 7253 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7254 7255 # Param 7256 param = self.get_param() 7257 7258 # Quick HGVS 7259 if "hgvs_options" in param and param.get("hgvs_options", ""): 7260 log.info(f"Quick HGVS Annotation:") 7261 if not param.get("hgvs", None): 7262 param["hgvs"] = {} 7263 for option in param.get("hgvs_options", "").split(","): 7264 option_var_val = option.split("=") 7265 option_var = option_var_val[0] 7266 if len(option_var_val) > 1: 7267 option_val = option_var_val[1] 7268 else: 7269 option_val = "True" 7270 if option_val.upper() in ["TRUE"]: 7271 option_val = True 7272 elif option_val.upper() in ["FALSE"]: 7273 option_val = False 7274 log.info(f" {option_var}={option_val}") 7275 param["hgvs"][option_var] = option_val 7276 
7277 # Check if HGVS annotation enabled 7278 if "hgvs" in param: 7279 log.info(f"HGVS Annotation... ") 7280 for hgvs_option in param.get("hgvs", {}): 7281 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7282 else: 7283 return 7284 7285 # HGVS Param 7286 param_hgvs = param.get("hgvs", {}) 7287 use_exon = param_hgvs.get("use_exon", False) 7288 use_gene = param_hgvs.get("use_gene", False) 7289 use_protein = param_hgvs.get("use_protein", False) 7290 add_protein = param_hgvs.get("add_protein", False) 7291 full_format = param_hgvs.get("full_format", False) 7292 use_version = param_hgvs.get("use_version", False) 7293 codon_type = param_hgvs.get("codon_type", "3") 7294 7295 # refSseq refSeqLink 7296 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7297 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7298 7299 # Assembly 7300 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7301 7302 # Genome 7303 genome_file = None 7304 if find_genome(databases_genome): 7305 genome_file = find_genome(databases_genome) 7306 else: 7307 genome_file = find_genome( 7308 genome_path=databases_genomes_folders, assembly=assembly 7309 ) 7310 log.debug("Genome: " + str(genome_file)) 7311 7312 # refSseq 7313 refseq_file = find_file_prefix( 7314 input_file=databases_refseq, 7315 prefix="ncbiRefSeq", 7316 folder=databases_refseq_folders, 7317 assembly=assembly, 7318 ) 7319 log.debug("refSeq: " + str(refseq_file)) 7320 7321 # refSeqLink 7322 refseqlink_file = find_file_prefix( 7323 input_file=databases_refseqlink, 7324 prefix="ncbiRefSeqLink", 7325 folder=databases_refseq_folders, 7326 assembly=assembly, 7327 ) 7328 log.debug("refSeqLink: " + str(refseqlink_file)) 7329 7330 # Threads 7331 if not threads: 7332 threads = self.get_threads() 7333 log.debug("Threads: " + str(threads)) 7334 7335 # Variables 7336 table_variants = self.get_table_variants(clause="update") 7337 7338 # Get variants SNV and InDel only 7339 
        # Keep only variants whose REF and ALT are pure letter sequences
        # (SNV/InDel); symbolic alleles (e.g. '<DEL>') and breakends are excluded
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (temporary working columns, dropped at the end of the method)
        added_columns = []

        # Add hgvs column in variants table
        # NOTE(review): the random 0-999 suffix reduces the chance of clashing
        # with an existing column, but a collision is still possible — consider
        # a UUID-based suffix
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe: one row per (variant position,
        # overlapping transcript), i.e. txStart <= POS <= txEnd on the same chrom
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe: transcript accession ->
            # protein accession (both with version)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table}
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe — 'refseqlink_df' is later queried by name through
            # the polars SQLContext (register_globals) in annotation_hgvs_partition
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        # Export the refSeq transcripts overlapping the variants to a TSV file,
        # then parse them with read_transcripts into transcript objects used by
        # format_hgvs_name in the partition workers
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '	');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # NOTE(review): re-binds the SQLContext created earlier in this method,
        # presumably so the just-created refseq_df/refseqlink_df globals are
        # registered and visible to annotation_hgvs_partition — confirm
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe (compute() triggers the
        # actual, lazy, per-partition HGVS computation)
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
7430 with tempfile.TemporaryDirectory() as tmpdir: 7431 df_parquet = os.path.join(tmpdir, "df.parquet") 7432 df.to_parquet(df_parquet) 7433 7434 # Update hgvs column 7435 update_variant_query = f""" 7436 UPDATE {table_variants} 7437 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 7438 FROM read_parquet('{df_parquet}') as df 7439 WHERE variants."#CHROM" = df.CHROM 7440 AND variants.POS = df.POS 7441 AND variants.REF = df.REF 7442 AND variants.ALT = df.ALT 7443 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 7444 """ 7445 self.execute_query(update_variant_query) 7446 7447 # Update INFO column 7448 sql_query_update = f""" 7449 UPDATE {table_variants} 7450 SET INFO = 7451 concat( 7452 CASE 7453 WHEN INFO NOT IN ('','.') 7454 THEN concat(INFO, ';') 7455 ELSE '' 7456 END, 7457 'hgvs=', 7458 {hgvs_column_name} 7459 ) 7460 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 7461 """ 7462 self.execute_query(sql_query_update) 7463 7464 # Add header 7465 HGVS_INFOS = { 7466 "hgvs": { 7467 "ID": "hgvs", 7468 "Number": ".", 7469 "Type": "String", 7470 "Description": f"HGVS annotatation with HOWARD", 7471 } 7472 } 7473 7474 for field in HGVS_INFOS: 7475 field_ID = HGVS_INFOS[field]["ID"] 7476 field_description = HGVS_INFOS[field]["Description"] 7477 self.get_header().infos[field_ID] = vcf.parser._Info( 7478 field_ID, 7479 HGVS_INFOS[field]["Number"], 7480 HGVS_INFOS[field]["Type"], 7481 field_description, 7482 "unknown", 7483 "unknown", 7484 code_type_map[HGVS_INFOS[field]["Type"]], 7485 ) 7486 7487 # Remove added columns 7488 for added_column in added_columns: 7489 self.drop_column(column=added_column) 7490 7491 ### 7492 # Calculation 7493 ### 7494 7495 def get_operations_help( 7496 self, operations_config_dict: dict = {}, operations_config_file: str = None 7497 ) -> list: 7498 7499 # Init 7500 operations_help = [] 7501 7502 # operations 7503 operations = self.get_config_json( 7504 name="calculations", 7505 
config_dict=operations_config_dict, 7506 config_file=operations_config_file, 7507 ) 7508 for op in operations: 7509 op_name = operations[op].get("name", op).upper() 7510 op_description = operations[op].get("description", op_name) 7511 op_available = operations[op].get("available", False) 7512 if op_available: 7513 operations_help.append(f" {op_name}: {op_description}") 7514 7515 # Sort operations 7516 operations_help.sort() 7517 7518 # insert header 7519 operations_help.insert(0, "Available calculation operations:") 7520 7521 # Return 7522 return operations_help 7523 7524 def calculation( 7525 self, 7526 operations: dict = {}, 7527 operations_config_dict: dict = {}, 7528 operations_config_file: str = None, 7529 ) -> None: 7530 """ 7531 It takes a list of operations, and for each operation, it checks if it's a python or sql 7532 operation, and then calls the appropriate function 7533 7534 param json example: 7535 "calculation": { 7536 "NOMEN": { 7537 "options": { 7538 "hgvs_field": "hgvs" 7539 }, 7540 "middle" : null 7541 } 7542 """ 7543 7544 # Param 7545 param = self.get_param() 7546 7547 # operations config 7548 operations_config = self.get_config_json( 7549 name="calculations", 7550 config_dict=operations_config_dict, 7551 config_file=operations_config_file, 7552 ) 7553 7554 # Upper keys 7555 operations_config = {k.upper(): v for k, v in operations_config.items()} 7556 7557 # Calculations 7558 7559 # Operations from param 7560 operations = param.get("calculation", {}).get("calculations", operations) 7561 7562 # Quick calculation - add 7563 if param.get("calculations", None): 7564 calculations_list = [ 7565 value for value in param.get("calculations", "").split(",") 7566 ] 7567 log.info(f"Quick Calculations:") 7568 for calculation_key in calculations_list: 7569 log.info(f" {calculation_key}") 7570 for calculation_operation in calculations_list: 7571 if calculation_operation.upper() not in operations: 7572 operations[calculation_operation.upper()] = {} 7573 
add_value_into_dict( 7574 dict_tree=param, 7575 sections=[ 7576 "calculation", 7577 "calculations", 7578 calculation_operation.upper(), 7579 ], 7580 value={}, 7581 ) 7582 7583 # Operations for calculation 7584 if not operations: 7585 operations = param.get("calculation", {}).get("calculations", {}) 7586 7587 if operations: 7588 log.info(f"Calculations...") 7589 7590 # For each operations 7591 for operation_name in operations: 7592 operation_name = operation_name.upper() 7593 if operation_name not in [""]: 7594 if operation_name in operations_config: 7595 log.info(f"Calculation '{operation_name}'") 7596 operation = operations_config[operation_name] 7597 operation_type = operation.get("type", "sql") 7598 if operation_type == "python": 7599 self.calculation_process_function( 7600 operation=operation, operation_name=operation_name 7601 ) 7602 elif operation_type == "sql": 7603 self.calculation_process_sql( 7604 operation=operation, operation_name=operation_name 7605 ) 7606 else: 7607 log.error( 7608 f"Operations config: Type '{operation_type}' NOT available" 7609 ) 7610 raise ValueError( 7611 f"Operations config: Type '{operation_type}' NOT available" 7612 ) 7613 else: 7614 log.error( 7615 f"Operations config: Calculation '{operation_name}' NOT available" 7616 ) 7617 raise ValueError( 7618 f"Operations config: Calculation '{operation_name}' NOT available" 7619 ) 7620 7621 # Explode INFOS fields into table fields 7622 if self.get_explode_infos(): 7623 self.explode_infos( 7624 prefix=self.get_explode_infos_prefix(), 7625 fields=self.get_explode_infos_fields(), 7626 force=True, 7627 ) 7628 7629 def calculation_process_sql( 7630 self, operation: dict, operation_name: str = "unknown" 7631 ) -> None: 7632 """ 7633 The `calculation_process_sql` function takes in a mathematical operation as a string and 7634 performs the operation, updating the specified table with the result. 
7635 7636 :param operation: The `operation` parameter is a dictionary that contains information about the 7637 mathematical operation to be performed. It includes the following keys: 7638 :type operation: dict 7639 :param operation_name: The `operation_name` parameter is a string that represents the name of 7640 the mathematical operation being performed. It is used for logging and error handling purposes, 7641 defaults to unknown 7642 :type operation_name: str (optional) 7643 """ 7644 7645 # table variants 7646 table_variants = self.get_table_variants(clause="alter") 7647 7648 # Operation infos 7649 operation_name = operation.get("name", "unknown") 7650 log.debug(f"process sql {operation_name}") 7651 output_column_name = operation.get("output_column_name", operation_name) 7652 output_column_type = operation.get("output_column_type", "String") 7653 prefix = operation.get("explode_infos_prefix", "") 7654 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7655 output_column_description = operation.get( 7656 "output_column_description", f"{operation_name} operation" 7657 ) 7658 operation_query = operation.get("operation_query", None) 7659 if isinstance(operation_query, list): 7660 operation_query = " ".join(operation_query) 7661 operation_info_fields = operation.get("info_fields", []) 7662 operation_info_fields_check = operation.get("info_fields_check", False) 7663 operation_info = operation.get("operation_info", True) 7664 7665 if operation_query: 7666 7667 # Info fields check 7668 operation_info_fields_check_result = True 7669 if operation_info_fields_check: 7670 header_infos = self.get_header().infos 7671 for info_field in operation_info_fields: 7672 operation_info_fields_check_result = ( 7673 operation_info_fields_check_result 7674 and info_field in header_infos 7675 ) 7676 7677 # If info fields available 7678 if operation_info_fields_check_result: 7679 7680 # Added_columns 7681 added_columns = [] 7682 7683 # Create VCF header field 
7684 vcf_reader = self.get_header() 7685 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7686 output_column_name, 7687 ".", 7688 output_column_type, 7689 output_column_description, 7690 "howard calculation", 7691 "0", 7692 self.code_type_map.get(output_column_type), 7693 ) 7694 7695 # Explode infos if needed 7696 log.debug(f"calculation_process_sql prefix {prefix}") 7697 added_columns += self.explode_infos( 7698 prefix=prefix, 7699 fields=[output_column_name] + operation_info_fields, 7700 force=True, 7701 ) 7702 7703 # Create column 7704 added_column = self.add_column( 7705 table_name=table_variants, 7706 column_name=prefix + output_column_name, 7707 column_type=output_column_type_sql, 7708 default_value="null", 7709 ) 7710 added_columns.append(added_column) 7711 7712 # Operation calculation 7713 try: 7714 7715 # Query to update calculation column 7716 sql_update = f""" 7717 UPDATE {table_variants} 7718 SET "{prefix}{output_column_name}" = ({operation_query}) 7719 """ 7720 self.conn.execute(sql_update) 7721 7722 # Add to INFO 7723 if operation_info: 7724 sql_update_info = f""" 7725 UPDATE {table_variants} 7726 SET "INFO" = 7727 concat( 7728 CASE 7729 WHEN "INFO" IS NOT NULL 7730 THEN concat("INFO", ';') 7731 ELSE '' 7732 END, 7733 '{output_column_name}=', 7734 "{prefix}{output_column_name}" 7735 ) 7736 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7737 """ 7738 self.conn.execute(sql_update_info) 7739 7740 except: 7741 log.error( 7742 f"Operations config: Calculation '{operation_name}' query failed" 7743 ) 7744 raise ValueError( 7745 f"Operations config: Calculation '{operation_name}' query failed" 7746 ) 7747 7748 # Remove added columns 7749 for added_column in added_columns: 7750 log.debug(f"added_column: {added_column}") 7751 self.drop_column(column=added_column) 7752 7753 else: 7754 log.error( 7755 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 7756 ) 7757 raise ValueError( 7758 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7759 ) 7760 7761 else: 7762 log.error( 7763 f"Operations config: Calculation '{operation_name}' query NOT defined" 7764 ) 7765 raise ValueError( 7766 f"Operations config: Calculation '{operation_name}' query NOT defined" 7767 ) 7768 7769 def calculation_process_function( 7770 self, operation: dict, operation_name: str = "unknown" 7771 ) -> None: 7772 """ 7773 The `calculation_process_function` takes in an operation dictionary and performs the specified 7774 function with the given parameters. 7775 7776 :param operation: The `operation` parameter is a dictionary that contains information about the 7777 operation to be performed. It has the following keys: 7778 :type operation: dict 7779 :param operation_name: The `operation_name` parameter is a string that represents the name of 7780 the operation being performed. It is used for logging purposes, defaults to unknown 7781 :type operation_name: str (optional) 7782 """ 7783 7784 operation_name = operation["name"] 7785 log.debug(f"process sql {operation_name}") 7786 function_name = operation["function_name"] 7787 function_params = operation["function_params"] 7788 getattr(self, function_name)(*function_params) 7789 7790 def calculation_variant_id(self) -> None: 7791 """ 7792 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 7793 updates the INFO field of a variants table with the variant ID. 
7794 """ 7795 7796 # variant_id annotation field 7797 variant_id_tag = self.get_variant_id_column() 7798 added_columns = [variant_id_tag] 7799 7800 # variant_id hgvs tags" 7801 vcf_infos_tags = { 7802 variant_id_tag: "howard variant ID annotation", 7803 } 7804 7805 # Variants table 7806 table_variants = self.get_table_variants() 7807 7808 # Header 7809 vcf_reader = self.get_header() 7810 7811 # Add variant_id to header 7812 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7813 variant_id_tag, 7814 ".", 7815 "String", 7816 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7817 "howard calculation", 7818 "0", 7819 self.code_type_map.get("String"), 7820 ) 7821 7822 # Update 7823 sql_update = f""" 7824 UPDATE {table_variants} 7825 SET "INFO" = 7826 concat( 7827 CASE 7828 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7829 THEN '' 7830 ELSE concat("INFO", ';') 7831 END, 7832 '{variant_id_tag}=', 7833 "{variant_id_tag}" 7834 ) 7835 """ 7836 self.conn.execute(sql_update) 7837 7838 # Remove added columns 7839 for added_column in added_columns: 7840 self.drop_column(column=added_column) 7841 7842 def calculation_extract_snpeff_hgvs( 7843 self, 7844 snpeff_hgvs: str = "snpeff_hgvs", 7845 snpeff_field: str = "ANN", 7846 ) -> None: 7847 """ 7848 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 7849 annotation field in a VCF file and adds them as a new column in the variants table. 7850 7851 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 7852 function is used to specify the name of the column that will store the HGVS nomenclatures 7853 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 7854 snpeff_hgvs 7855 :type snpeff_hgvs: str (optional) 7856 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 7857 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 7858 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 7859 to ANN 7860 :type snpeff_field: str (optional) 7861 """ 7862 7863 # Snpeff hgvs tags 7864 vcf_infos_tags = { 7865 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 7866 } 7867 7868 # Prefix 7869 prefix = self.get_explode_infos_prefix() 7870 if prefix: 7871 prefix = "INFO/" 7872 7873 # snpEff fields 7874 speff_ann_infos = prefix + snpeff_field 7875 speff_hgvs_infos = prefix + snpeff_hgvs 7876 7877 # Variants table 7878 table_variants = self.get_table_variants() 7879 7880 # Header 7881 vcf_reader = self.get_header() 7882 7883 # Add columns 7884 added_columns = [] 7885 7886 # Explode HGVS field in column 7887 added_columns += self.explode_infos(fields=[snpeff_field]) 7888 7889 if snpeff_field in vcf_reader.infos: 7890 7891 log.debug(vcf_reader.infos[snpeff_field]) 7892 7893 # Extract ANN header 7894 ann_description = vcf_reader.infos[snpeff_field].desc 7895 pattern = r"'(.+?)'" 7896 match = re.search(pattern, ann_description) 7897 if match: 7898 ann_header_match = match.group(1).split(" | ") 7899 ann_header_desc = {} 7900 for i in range(len(ann_header_match)): 7901 ann_header_info = "".join( 7902 char for char in ann_header_match[i] if char.isalnum() 7903 ) 7904 ann_header_desc[ann_header_info] = ann_header_match[i] 7905 if not ann_header_desc: 7906 raise ValueError("Invalid header description format") 7907 else: 7908 raise ValueError("Invalid header description format") 7909 7910 # Create variant id 7911 variant_id_column = self.get_variant_id_column() 7912 added_columns += [variant_id_column] 7913 7914 # Create dataframe 7915 dataframe_snpeff_hgvs = self.get_query_to_df( 7916 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 7917 ) 7918 7919 # Create main NOMEN column 7920 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 7921 speff_ann_infos 7922 ].apply( 7923 lambda x: extract_snpeff_hgvs( 
7924 str(x), header=list(ann_header_desc.values()) 7925 ) 7926 ) 7927 7928 # Add snpeff_hgvs to header 7929 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 7930 snpeff_hgvs, 7931 ".", 7932 "String", 7933 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 7934 "howard calculation", 7935 "0", 7936 self.code_type_map.get("String"), 7937 ) 7938 7939 # Update 7940 sql_update = f""" 7941 UPDATE variants 7942 SET "INFO" = 7943 concat( 7944 CASE 7945 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7946 THEN '' 7947 ELSE concat("INFO", ';') 7948 END, 7949 CASE 7950 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 7951 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 7952 THEN concat( 7953 '{snpeff_hgvs}=', 7954 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 7955 ) 7956 ELSE '' 7957 END 7958 ) 7959 FROM dataframe_snpeff_hgvs 7960 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 7961 7962 """ 7963 self.conn.execute(sql_update) 7964 7965 # Delete dataframe 7966 del dataframe_snpeff_hgvs 7967 gc.collect() 7968 7969 else: 7970 7971 log.warning( 7972 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 7973 ) 7974 7975 # Remove added columns 7976 for added_column in added_columns: 7977 self.drop_column(column=added_column) 7978 7979 def calculation_snpeff_ann_explode( 7980 self, 7981 uniquify: bool = True, 7982 output_format: str = "fields", 7983 output_prefix: str = "snpeff_", 7984 snpeff_field: str = "ANN", 7985 ) -> None: 7986 """ 7987 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 7988 exploding the HGVS field and updating variant information accordingly. 7989 7990 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 7991 boolean flag that determines whether the output should be uniquified or not. 
When set to `True`, 7992 it indicates that the output should be unique, meaning that duplicate entries should be removed, 7993 defaults to True 7994 :type uniquify: bool (optional) 7995 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 7996 function specifies the format in which the output annotations will be generated. It has a 7997 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 7998 format, defaults to fields 7999 :type output_format: str (optional) 8000 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8001 method is used to specify the prefix that will be added to the output annotations generated 8002 during the calculation process. This prefix helps to differentiate the newly added annotations 8003 from existing ones in the output data. By default, the, defaults to ANN_ 8004 :type output_prefix: str (optional) 8005 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8006 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8007 field will be processed to explode the HGVS annotations and update the variant information 8008 accordingly, defaults to ANN 8009 :type snpeff_field: str (optional) 8010 """ 8011 8012 # SnpEff annotation field 8013 snpeff_hgvs = "snpeff_ann_explode" 8014 8015 # Snpeff hgvs tags 8016 vcf_infos_tags = { 8017 snpeff_hgvs: "Explode snpEff annotations", 8018 } 8019 8020 # Prefix 8021 prefix = self.get_explode_infos_prefix() 8022 if prefix: 8023 prefix = "INFO/" 8024 8025 # snpEff fields 8026 speff_ann_infos = prefix + snpeff_field 8027 speff_hgvs_infos = prefix + snpeff_hgvs 8028 8029 # Variants table 8030 table_variants = self.get_table_variants() 8031 8032 # Header 8033 vcf_reader = self.get_header() 8034 8035 # Add columns 8036 added_columns = [] 8037 8038 # Explode HGVS field in column 8039 added_columns += self.explode_infos(fields=[snpeff_field]) 8040 log.debug(f"snpeff_field={snpeff_field}") 8041 log.debug(f"added_columns={added_columns}") 8042 8043 if snpeff_field in vcf_reader.infos: 8044 8045 # Extract ANN header 8046 ann_description = vcf_reader.infos[snpeff_field].desc 8047 pattern = r"'(.+?)'" 8048 match = re.search(pattern, ann_description) 8049 if match: 8050 ann_header_match = match.group(1).split(" | ") 8051 ann_header = [] 8052 ann_header_desc = {} 8053 for i in range(len(ann_header_match)): 8054 ann_header_info = "".join( 8055 char for char in ann_header_match[i] if char.isalnum() 8056 ) 8057 ann_header.append(ann_header_info) 8058 ann_header_desc[ann_header_info] = ann_header_match[i] 8059 if not ann_header_desc: 8060 raise ValueError("Invalid header description format") 8061 else: 8062 raise ValueError("Invalid header description format") 8063 8064 # Create variant id 8065 variant_id_column = self.get_variant_id_column() 8066 added_columns += [variant_id_column] 8067 8068 # Create dataframe 8069 dataframe_snpeff_hgvs = self.get_query_to_df( 8070 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8071 ) 8072 
8073 # Create snpEff columns 8074 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8075 speff_ann_infos 8076 ].apply( 8077 lambda x: explode_snpeff_ann( 8078 str(x), 8079 uniquify=uniquify, 8080 output_format=output_format, 8081 prefix=output_prefix, 8082 header=list(ann_header_desc.values()), 8083 ) 8084 ) 8085 8086 # Header 8087 ann_annotations_prefix = "" 8088 if output_format.upper() in ["JSON"]: 8089 ann_annotations_prefix = f"{output_prefix}=" 8090 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8091 output_prefix, 8092 ".", 8093 "String", 8094 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8095 + " - JSON format", 8096 "howard calculation", 8097 "0", 8098 self.code_type_map.get("String"), 8099 ) 8100 else: 8101 for ann_annotation in ann_header: 8102 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8103 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8104 ann_annotation_id, 8105 ".", 8106 "String", 8107 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8108 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8109 "howard calculation", 8110 "0", 8111 self.code_type_map.get("String"), 8112 ) 8113 8114 # Update 8115 sql_update = f""" 8116 UPDATE variants 8117 SET "INFO" = 8118 concat( 8119 CASE 8120 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8121 THEN '' 8122 ELSE concat("INFO", ';') 8123 END, 8124 CASE 8125 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8126 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8127 THEN concat( 8128 '{ann_annotations_prefix}', 8129 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8130 ) 8131 ELSE '' 8132 END 8133 ) 8134 FROM dataframe_snpeff_hgvs 8135 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8136 8137 """ 8138 self.conn.execute(sql_update) 8139 8140 # Delete dataframe 8141 del dataframe_snpeff_hgvs 8142 gc.collect() 8143 8144 else: 8145 8146 log.warning( 8147 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8148 ) 8149 8150 # Remove added columns 8151 for added_column in added_columns: 8152 self.drop_column(column=added_column) 8153 8154 def calculation_extract_nomen(self) -> None: 8155 """ 8156 This function extracts the HGVS nomenclature from the calculation/identification of NOMEN. 8157 """ 8158 8159 # NOMEN field 8160 field_nomen_dict = "NOMEN_DICT" 8161 8162 # NOMEN structure 8163 nomen_dict = { 8164 "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)", 8165 "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)", 8166 "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)", 8167 "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant", 8168 "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)", 8169 "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)", 8170 "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)", 8171 "VNOMEN": "VNOMEN hgvs transcript version used (e.g. 
for CNOMEN and PNOMEN)", 8172 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8173 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8174 } 8175 8176 # Param 8177 param = self.get_param() 8178 8179 # Prefix 8180 prefix = self.get_explode_infos_prefix() 8181 8182 # Header 8183 vcf_reader = self.get_header() 8184 8185 # Get HGVS field 8186 hgvs_field = ( 8187 param.get("calculation", {}) 8188 .get("calculations", {}) 8189 .get("NOMEN", {}) 8190 .get("options", {}) 8191 .get("hgvs_field", "hgvs") 8192 ) 8193 8194 # Get transcripts 8195 transcripts_file = ( 8196 param.get("calculation", {}) 8197 .get("calculations", {}) 8198 .get("NOMEN", {}) 8199 .get("options", {}) 8200 .get("transcripts", None) 8201 ) 8202 transcripts_file = full_path(transcripts_file) 8203 transcripts = [] 8204 if transcripts_file: 8205 if os.path.exists(transcripts_file): 8206 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8207 transcripts = transcripts_dataframe.iloc[:, 0].tolist() 8208 else: 8209 log.error(f"Transcript file '{transcripts_file}' does NOT exist") 8210 raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist") 8211 8212 # Added columns 8213 added_columns = [] 8214 8215 # Explode HGVS field in column 8216 added_columns += self.explode_infos(fields=[hgvs_field]) 8217 8218 # extra infos 8219 extra_infos = self.get_extra_infos() 8220 extra_field = prefix + hgvs_field 8221 8222 if extra_field in extra_infos: 8223 8224 # Create dataframe 8225 dataframe_hgvs = self.get_query_to_df( 8226 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """ 8227 ) 8228 8229 # Create main NOMEN column 8230 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply( 8231 lambda x: find_nomen(str(x), transcripts=transcripts) 8232 ) 8233 8234 # Explode NOMEN Structure and create SQL set for update 8235 sql_nomen_fields = [] 8236 for nomen_field in nomen_dict: 8237 8238 # Explode each field 
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        Runs only when the VCF has a FORMAT column and at least one sample; otherwise it is a
        no-op.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (used as join key for the UPDATE)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column from each row's genotypes
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO; DuckDB resolves `dataframe_findbypipeline` in the FROM
            # clause from the local pandas DataFrame (replacement scan)
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_findbypipeline
            gc.collect()
concat("INFO", ';') 8379 END, 8380 CASE 8381 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 8382 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 8383 THEN concat( 8384 '{findbypipeline_tag}=', 8385 dataframe_findbypipeline."{findbypipeline_infos}" 8386 ) 8387 ELSE '' 8388 END 8389 ) 8390 FROM dataframe_findbypipeline 8391 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 8392 """ 8393 self.conn.execute(sql_update) 8394 8395 # Remove added columns 8396 for added_column in added_columns: 8397 self.drop_column(column=added_column) 8398 8399 # Delete dataframe 8400 del dataframe_findbypipeline 8401 gc.collect() 8402 8403 def calculation_genotype_concordance(self) -> None: 8404 """ 8405 The function `calculation_genotype_concordance` calculates the genotype concordance for 8406 multi-caller VCF files and updates the variant information in the database. 8407 """ 8408 8409 # if FORMAT and samples 8410 if ( 8411 "FORMAT" in self.get_header_columns_as_list() 8412 and self.get_header_sample_list() 8413 ): 8414 8415 # genotypeconcordance annotation field 8416 genotypeconcordance_tag = "genotypeconcordance" 8417 8418 # VCF infos tags 8419 vcf_infos_tags = { 8420 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 8421 } 8422 8423 # Prefix 8424 prefix = self.get_explode_infos_prefix() 8425 8426 # Field 8427 genotypeconcordance_infos = prefix + genotypeconcordance_tag 8428 8429 # Variants table 8430 table_variants = self.get_table_variants() 8431 8432 # Header 8433 vcf_reader = self.get_header() 8434 8435 # Create variant id 8436 variant_id_column = self.get_variant_id_column() 8437 added_columns = [variant_id_column] 8438 8439 # variant_id, FORMAT and samples 8440 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8441 self.get_header_sample_list() 8442 ) 8443 8444 # Create dataframe 8445 dataframe_genotypeconcordance = self.get_query_to_df( 8446 f""" SELECT 
{samples_fields} FROM {table_variants} """ 8447 ) 8448 8449 # Create genotypeconcordance column 8450 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 8451 dataframe_genotypeconcordance.apply( 8452 lambda row: genotypeconcordance( 8453 row, samples=self.get_header_sample_list() 8454 ), 8455 axis=1, 8456 ) 8457 ) 8458 8459 # Add genotypeconcordance to header 8460 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 8461 genotypeconcordance_tag, 8462 ".", 8463 "String", 8464 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 8465 "howard calculation", 8466 "0", 8467 self.code_type_map.get("String"), 8468 ) 8469 8470 # Update 8471 sql_update = f""" 8472 UPDATE variants 8473 SET "INFO" = 8474 concat( 8475 CASE 8476 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8477 THEN '' 8478 ELSE concat("INFO", ';') 8479 END, 8480 CASE 8481 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 8482 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 8483 THEN concat( 8484 '{genotypeconcordance_tag}=', 8485 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 8486 ) 8487 ELSE '' 8488 END 8489 ) 8490 FROM dataframe_genotypeconcordance 8491 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 8492 """ 8493 self.conn.execute(sql_update) 8494 8495 # Remove added columns 8496 for added_column in added_columns: 8497 self.drop_column(column=added_column) 8498 8499 # Delete dataframe 8500 del dataframe_genotypeconcordance 8501 gc.collect() 8502 8503 def calculation_barcode(self, tag: str = "barcode") -> None: 8504 """ 8505 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8506 updates the INFO field in the file with the calculated barcode values. 8507 8508 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8509 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 8510 the default tag name is set to "barcode", defaults to barcode 8511 :type tag: str (optional) 8512 """ 8513 8514 # if FORMAT and samples 8515 if ( 8516 "FORMAT" in self.get_header_columns_as_list() 8517 and self.get_header_sample_list() 8518 ): 8519 8520 # barcode annotation field 8521 if not tag: 8522 tag = "barcode" 8523 8524 # VCF infos tags 8525 vcf_infos_tags = { 8526 tag: "barcode calculation (VaRank)", 8527 } 8528 8529 # Prefix 8530 prefix = self.get_explode_infos_prefix() 8531 8532 # Field 8533 barcode_infos = prefix + tag 8534 8535 # Variants table 8536 table_variants = self.get_table_variants() 8537 8538 # Header 8539 vcf_reader = self.get_header() 8540 8541 # Create variant id 8542 variant_id_column = self.get_variant_id_column() 8543 added_columns = [variant_id_column] 8544 8545 # variant_id, FORMAT and samples 8546 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8547 self.get_header_sample_list() 8548 ) 8549 8550 # Create dataframe 8551 dataframe_barcode = self.get_query_to_df( 8552 f""" SELECT {samples_fields} FROM {table_variants} """ 8553 ) 8554 8555 # Create barcode column 8556 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8557 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8558 ) 8559 8560 # Add barcode to header 8561 vcf_reader.infos[tag] = vcf.parser._Info( 8562 tag, 8563 ".", 8564 "String", 8565 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8566 "howard calculation", 8567 "0", 8568 self.code_type_map.get("String"), 8569 ) 8570 8571 # Update 8572 sql_update = f""" 8573 UPDATE {table_variants} 8574 SET "INFO" = 8575 concat( 8576 CASE 8577 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8578 THEN '' 8579 ELSE concat("INFO", ';') 8580 END, 8581 CASE 8582 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8583 AND dataframe_barcode."{barcode_infos}" NOT NULL 8584 THEN concat( 8585 '{tag}=', 8586 dataframe_barcode."{barcode_infos}" 8587 ) 8588 ELSE '' 8589 
END 8590 ) 8591 FROM dataframe_barcode 8592 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8593 """ 8594 self.conn.execute(sql_update) 8595 8596 # Remove added columns 8597 for added_column in added_columns: 8598 self.drop_column(column=added_column) 8599 8600 # Delete dataframe 8601 del dataframe_barcode 8602 gc.collect() 8603 8604 def calculation_barcode_family(self, tag: str = "BCF") -> None: 8605 """ 8606 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 8607 and updates the INFO field in the file with the calculated barcode values. 8608 8609 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 8610 the barcode tag that will be added to the VCF file during the calculation process. If no value 8611 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 8612 :type tag: str (optional) 8613 """ 8614 8615 # if FORMAT and samples 8616 if ( 8617 "FORMAT" in self.get_header_columns_as_list() 8618 and self.get_header_sample_list() 8619 ): 8620 8621 # barcode annotation field 8622 if not tag: 8623 tag = "BCF" 8624 8625 # VCF infos tags 8626 vcf_infos_tags = { 8627 tag: "barcode family calculation", 8628 f"{tag}S": "barcode family samples", 8629 } 8630 8631 # Param 8632 param = self.get_param() 8633 log.debug(f"param={param}") 8634 8635 # Prefix 8636 prefix = self.get_explode_infos_prefix() 8637 8638 # PED param 8639 ped = ( 8640 param.get("calculation", {}) 8641 .get("calculations", {}) 8642 .get("BARCODEFAMILY", {}) 8643 .get("family_pedigree", None) 8644 ) 8645 log.debug(f"ped={ped}") 8646 8647 # Load PED 8648 if ped: 8649 8650 # Pedigree is a file 8651 if isinstance(ped, str) and os.path.exists(full_path(ped)): 8652 log.debug("Pedigree is file") 8653 with open(full_path(ped)) as ped: 8654 ped = json.load(ped) 8655 8656 # Pedigree is a string 8657 elif isinstance(ped, str): 8658 log.debug("Pedigree is str") 8659 
try: 8660 ped = json.loads(ped) 8661 log.debug("Pedigree is json str") 8662 except ValueError as e: 8663 ped_samples = ped.split(",") 8664 ped = {} 8665 for ped_sample in ped_samples: 8666 ped[ped_sample] = ped_sample 8667 8668 # Pedigree is a dict 8669 elif isinstance(ped, dict): 8670 log.debug("Pedigree is dict") 8671 8672 # Pedigree is not well formatted 8673 else: 8674 msg_error = "Pedigree not well formatted" 8675 log.error(msg_error) 8676 raise ValueError(msg_error) 8677 8678 # Construct list 8679 ped_samples = list(ped.values()) 8680 8681 else: 8682 log.debug("Pedigree not defined. Take all samples") 8683 ped_samples = self.get_header_sample_list() 8684 ped = {} 8685 for ped_sample in ped_samples: 8686 ped[ped_sample] = ped_sample 8687 8688 # Check pedigree 8689 if not ped or len(ped) == 0: 8690 msg_error = f"Error in pedigree: samples {ped_samples}" 8691 log.error(msg_error) 8692 raise ValueError(msg_error) 8693 8694 # Log 8695 log.info( 8696 "Calculation 'BARCODEFAMILY' - Samples: " 8697 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 8698 ) 8699 log.debug(f"ped_samples={ped_samples}") 8700 8701 # Field 8702 barcode_infos = prefix + tag 8703 8704 # Variants table 8705 table_variants = self.get_table_variants() 8706 8707 # Header 8708 vcf_reader = self.get_header() 8709 8710 # Create variant id 8711 variant_id_column = self.get_variant_id_column() 8712 added_columns = [variant_id_column] 8713 8714 # variant_id, FORMAT and samples 8715 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8716 ped_samples 8717 ) 8718 8719 # Create dataframe 8720 dataframe_barcode = self.get_query_to_df( 8721 f""" SELECT {samples_fields} FROM {table_variants} """ 8722 ) 8723 8724 # Create barcode column 8725 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8726 lambda row: barcode(row, samples=ped_samples), axis=1 8727 ) 8728 8729 # Add barcode family to header 8730 # Add vaf_normalization to header 8731 vcf_reader.formats[tag] = 
vcf.parser._Format( 8732 id=tag, 8733 num=".", 8734 type="String", 8735 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 8736 type_code=self.code_type_map.get("String"), 8737 ) 8738 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 8739 id=f"{tag}S", 8740 num=".", 8741 type="String", 8742 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 8743 type_code=self.code_type_map.get("String"), 8744 ) 8745 8746 # Update 8747 # for sample in ped_samples: 8748 sql_update_set = [] 8749 for sample in self.get_header_sample_list() + ["FORMAT"]: 8750 if sample in ped_samples: 8751 value = f'dataframe_barcode."{barcode_infos}"' 8752 value_samples = "'" + ",".join(ped_samples) + "'" 8753 elif sample == "FORMAT": 8754 value = f"'{tag}'" 8755 value_samples = f"'{tag}S'" 8756 else: 8757 value = "'.'" 8758 value_samples = "'.'" 8759 format_regex = r"[a-zA-Z0-9\s]" 8760 sql_update_set.append( 8761 f""" 8762 "{sample}" = 8763 concat( 8764 CASE 8765 WHEN {table_variants}."{sample}" = './.' 8766 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 8767 ELSE {table_variants}."{sample}" 8768 END, 8769 ':', 8770 {value}, 8771 ':', 8772 {value_samples} 8773 ) 8774 """ 8775 ) 8776 8777 sql_update_set_join = ", ".join(sql_update_set) 8778 sql_update = f""" 8779 UPDATE {table_variants} 8780 SET {sql_update_set_join} 8781 FROM dataframe_barcode 8782 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8783 """ 8784 self.conn.execute(sql_update) 8785 8786 # Remove added columns 8787 for added_column in added_columns: 8788 self.drop_column(column=added_column) 8789 8790 # Delete dataframe 8791 del dataframe_barcode 8792 gc.collect() 8793 8794 def calculation_trio(self) -> None: 8795 """ 8796 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 8797 information to the INFO field of each variant. 
8798 """ 8799 8800 # if FORMAT and samples 8801 if ( 8802 "FORMAT" in self.get_header_columns_as_list() 8803 and self.get_header_sample_list() 8804 ): 8805 8806 # trio annotation field 8807 trio_tag = "trio" 8808 8809 # VCF infos tags 8810 vcf_infos_tags = { 8811 "trio": "trio calculation", 8812 } 8813 8814 # Param 8815 param = self.get_param() 8816 8817 # Prefix 8818 prefix = self.get_explode_infos_prefix() 8819 8820 # Trio param 8821 trio_ped = ( 8822 param.get("calculation", {}) 8823 .get("calculations", {}) 8824 .get("TRIO", {}) 8825 .get("trio_pedigree", None) 8826 ) 8827 8828 # Load trio 8829 if trio_ped: 8830 8831 # Trio pedigree is a file 8832 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 8833 log.debug("TRIO pedigree is file") 8834 with open(full_path(trio_ped)) as trio_ped: 8835 trio_ped = json.load(trio_ped) 8836 8837 # Trio pedigree is a string 8838 elif isinstance(trio_ped, str): 8839 log.debug("TRIO pedigree is str") 8840 try: 8841 trio_ped = json.loads(trio_ped) 8842 log.debug("TRIO pedigree is json str") 8843 except ValueError as e: 8844 trio_samples = trio_ped.split(",") 8845 if len(trio_samples) == 3: 8846 trio_ped = { 8847 "father": trio_samples[0], 8848 "mother": trio_samples[1], 8849 "child": trio_samples[2], 8850 } 8851 log.debug("TRIO pedigree is list str") 8852 else: 8853 msg_error = "TRIO pedigree not well formatted" 8854 log.error(msg_error) 8855 raise ValueError(msg_error) 8856 8857 # Trio pedigree is a dict 8858 elif isinstance(trio_ped, dict): 8859 log.debug("TRIO pedigree is dict") 8860 8861 # Trio pedigree is not well formatted 8862 else: 8863 msg_error = "TRIO pedigree not well formatted" 8864 log.error(msg_error) 8865 raise ValueError(msg_error) 8866 8867 # Construct trio list 8868 trio_samples = [ 8869 trio_ped.get("father", ""), 8870 trio_ped.get("mother", ""), 8871 trio_ped.get("child", ""), 8872 ] 8873 8874 else: 8875 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 8876 samples_list = self.get_header_sample_list() 8877 if len(samples_list) >= 3: 8878 trio_samples = self.get_header_sample_list()[0:3] 8879 trio_ped = { 8880 "father": trio_samples[0], 8881 "mother": trio_samples[1], 8882 "child": trio_samples[2], 8883 } 8884 else: 8885 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 8886 log.error(msg_error) 8887 raise ValueError(msg_error) 8888 8889 # Check trio pedigree 8890 if not trio_ped or len(trio_ped) != 3: 8891 msg_error = f"Error in TRIO pedigree: {trio_ped}" 8892 log.error(msg_error) 8893 raise ValueError(msg_error) 8894 8895 # Log 8896 log.info( 8897 f"Calculation 'TRIO' - Samples: " 8898 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 8899 ) 8900 8901 # Field 8902 trio_infos = prefix + trio_tag 8903 8904 # Variants table 8905 table_variants = self.get_table_variants() 8906 8907 # Header 8908 vcf_reader = self.get_header() 8909 8910 # Create variant id 8911 variant_id_column = self.get_variant_id_column() 8912 added_columns = [variant_id_column] 8913 8914 # variant_id, FORMAT and samples 8915 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8916 self.get_header_sample_list() 8917 ) 8918 8919 # Create dataframe 8920 dataframe_trio = self.get_query_to_df( 8921 f""" SELECT {samples_fields} FROM {table_variants} """ 8922 ) 8923 8924 # Create trio column 8925 dataframe_trio[trio_infos] = dataframe_trio.apply( 8926 lambda row: trio(row, samples=trio_samples), axis=1 8927 ) 8928 8929 # Add trio to header 8930 vcf_reader.infos[trio_tag] = vcf.parser._Info( 8931 trio_tag, 8932 ".", 8933 "String", 8934 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 8935 "howard calculation", 8936 "0", 8937 self.code_type_map.get("String"), 8938 ) 8939 8940 # Update 8941 sql_update = f""" 8942 UPDATE {table_variants} 8943 SET "INFO" = 8944 concat( 8945 CASE 8946 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8947 THEN '' 8948 ELSE 
concat("INFO", ';') 8949 END, 8950 CASE 8951 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 8952 AND dataframe_trio."{trio_infos}" NOT NULL 8953 THEN concat( 8954 '{trio_tag}=', 8955 dataframe_trio."{trio_infos}" 8956 ) 8957 ELSE '' 8958 END 8959 ) 8960 FROM dataframe_trio 8961 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 8962 """ 8963 self.conn.execute(sql_update) 8964 8965 # Remove added columns 8966 for added_column in added_columns: 8967 self.drop_column(column=added_column) 8968 8969 # Delete dataframe 8970 del dataframe_trio 8971 gc.collect() 8972 8973 def calculation_vaf_normalization(self) -> None: 8974 """ 8975 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 8976 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 8977 :return: The function does not return anything. 8978 """ 8979 8980 # if FORMAT and samples 8981 if ( 8982 "FORMAT" in self.get_header_columns_as_list() 8983 and self.get_header_sample_list() 8984 ): 8985 8986 # vaf_normalization annotation field 8987 vaf_normalization_tag = "VAF" 8988 8989 # VCF infos tags 8990 vcf_infos_tags = { 8991 "VAF": "VAF Variant Frequency", 8992 } 8993 8994 # Prefix 8995 prefix = self.get_explode_infos_prefix() 8996 8997 # Variants table 8998 table_variants = self.get_table_variants() 8999 9000 # Header 9001 vcf_reader = self.get_header() 9002 9003 # Do not calculate if VAF already exists 9004 if "VAF" in vcf_reader.formats: 9005 log.debug("VAF already on genotypes") 9006 return 9007 9008 # Create variant id 9009 variant_id_column = self.get_variant_id_column() 9010 added_columns = [variant_id_column] 9011 9012 # variant_id, FORMAT and samples 9013 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9014 self.get_header_sample_list() 9015 ) 9016 9017 # Create dataframe 9018 dataframe_vaf_normalization = self.get_query_to_df( 9019 f""" SELECT {variant_id_column}, FORMAT, 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Runs only when the VCF has a FORMAT column and at least one sample.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one header entry per computed statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (used as join key for the UPDATE)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (dict of all statistics per variant)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract stats into its own column
                # (apply runs immediately each iteration, so the loop-variable
                # closure is evaluated before stat changes)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add stat field to header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: empty for the first appended field, ';' for the rest
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                                '{sep}{stat}=',
                                dataframe_vaf_stats."{stat}"
                            )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update INFO; DuckDB resolves `dataframe_vaf_stats` in the FROM
            # clause from the local pandas DataFrame (replacement scan)
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_vaf_stats
            gc.collect()
34class Variants: 35 36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Load data 78 if load: 79 self.load_data() 80 81 def set_input(self, input: str = None) -> None: 82 """ 83 The function `set_input` takes a file name as input, extracts the name and extension, and sets 84 attributes in the class accordingly. 85 86 :param input: The `set_input` method in the provided code snippet is used to set attributes 87 related to the input file. Here's a breakdown of the parameters and their usage in the method: 88 :type input: str 89 """ 90 91 if input and not isinstance(input, str): 92 try: 93 self.input = input.name 94 except: 95 log.error(f"Input file '{input} in bad format") 96 raise ValueError(f"Input file '{input} in bad format") 97 else: 98 self.input = input 99 100 # Input format 101 if input: 102 input_name, input_extension = os.path.splitext(self.input) 103 self.input_name = input_name 104 self.input_extension = input_extension 105 self.input_format = self.input_extension.replace(".", "") 106 107 def set_config(self, config: dict) -> None: 108 """ 109 The set_config function takes a config object and assigns it as the configuration object for the 110 class. 
111 112 :param config: The `config` parameter in the `set_config` function is a dictionary object that 113 contains configuration settings for the class. When you call the `set_config` function with a 114 dictionary object as the argument, it will set that dictionary as the configuration object for 115 the class 116 :type config: dict 117 """ 118 119 self.config = config 120 121 def set_param(self, param: dict) -> None: 122 """ 123 This function sets a parameter object for the class based on the input dictionary. 124 125 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 126 as the `param` attribute of the class instance 127 :type param: dict 128 """ 129 130 self.param = param 131 132 def init_variables(self) -> None: 133 """ 134 This function initializes the variables that will be used in the rest of the class 135 """ 136 137 self.prefix = "howard" 138 self.table_variants = "variants" 139 self.dataframe = None 140 141 self.comparison_map = { 142 "gt": ">", 143 "gte": ">=", 144 "lt": "<", 145 "lte": "<=", 146 "equals": "=", 147 "contains": "SIMILAR TO", 148 } 149 150 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 151 152 self.code_type_map_to_sql = { 153 "Integer": "INTEGER", 154 "String": "VARCHAR", 155 "Float": "FLOAT", 156 "Flag": "VARCHAR", 157 } 158 159 self.index_additionnal_fields = [] 160 161 def get_indexing(self) -> bool: 162 """ 163 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 164 returns False. 165 :return: The value of the indexing parameter. 166 """ 167 168 return self.get_param().get("indexing", False) 169 170 def get_connexion_config(self) -> dict: 171 """ 172 The function `get_connexion_config` returns a dictionary containing the configuration for a 173 connection, including the number of threads and memory limit. 174 :return: a dictionary containing the configuration for the Connexion library. 
175 """ 176 177 # config 178 config = self.get_config() 179 180 # Connexion config 181 connexion_config = {} 182 threads = self.get_threads() 183 184 # Threads 185 if threads: 186 connexion_config["threads"] = threads 187 188 # Memory 189 # if config.get("memory", None): 190 # connexion_config["memory_limit"] = config.get("memory") 191 if self.get_memory(): 192 connexion_config["memory_limit"] = self.get_memory() 193 194 # Temporary directory 195 if config.get("tmp", None): 196 connexion_config["temp_directory"] = config.get("tmp") 197 198 # Access 199 if config.get("access", None): 200 access = config.get("access") 201 if access in ["RO"]: 202 access = "READ_ONLY" 203 elif access in ["RW"]: 204 access = "READ_WRITE" 205 connexion_db = self.get_connexion_db() 206 if connexion_db in ":memory:": 207 access = "READ_WRITE" 208 connexion_config["access_mode"] = access 209 210 return connexion_config 211 212 def get_duckdb_settings(self) -> dict: 213 """ 214 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 215 string. 216 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 217 """ 218 219 # config 220 config = self.get_config() 221 222 # duckdb settings 223 duckdb_settings_dict = {} 224 if config.get("duckdb_settings", None): 225 duckdb_settings = config.get("duckdb_settings") 226 duckdb_settings = full_path(duckdb_settings) 227 # duckdb setting is a file 228 if os.path.exists(duckdb_settings): 229 with open(duckdb_settings) as json_file: 230 duckdb_settings_dict = yaml.safe_load(json_file) 231 # duckdb settings is a string 232 else: 233 duckdb_settings_dict = json.loads(duckdb_settings) 234 235 return duckdb_settings_dict 236 237 def set_connexion_db(self) -> str: 238 """ 239 The function `set_connexion_db` returns the appropriate database connection string based on the 240 input format and connection type. 241 :return: the value of the variable `connexion_db`. 
        """

        # Default connexion db
        default_connexion_db = ":memory:"

        # Find connexion db
        # 1) duckdb database files are used directly as the connexion db
        if self.get_input_format() in ["db", "duckdb"]:
            connexion_db = self.get_input()
        # 2) explicit (or unset) in-memory connexion
        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
            connexion_db = default_connexion_db
        # 3) temporary file-backed database
        elif self.get_connexion_type() in ["tmpfile"]:
            # NOTE: mkdtemp creates a *directory* (with a ".db" suffix);
            # the database file itself is "tmp.db" inside that directory
            tmp_name = tempfile.mkdtemp(
                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
            )
            connexion_db = f"{tmp_name}/tmp.db"
        # 4) any other non-empty connexion type is used as-is (a path)
        elif self.get_connexion_type() != "":
            connexion_db = self.get_connexion_type()
        else:
            connexion_db = default_connexion_db

        # Set connexion db
        self.connexion_db = connexion_db

        return connexion_db

    def set_connexion(self, conn) -> None:
        """
        The function `set_connexion` creates a connection to a database, with options for different
        database formats and settings.

        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
        database. If a connection is not provided, a new connection to an in-memory database is created.
        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
        sqlite)
        """

        # Connexion db (":memory:", a file path, ...)
        connexion_db = self.set_connexion_db()

        # Connexion config (threads, memory limit, access mode, ...)
        connexion_config = self.get_connexion_config()

        # Connexion format ("duckdb" by default)
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion: only create one if none was provided by the caller
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # Apply extra duckDB settings as PRAGMA statements
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        # string values must be single-quoted in the PRAGMA
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        The `set_output` function in Python sets the output file based on the input or a specified key
        in the config file, extracting the output name, extension, and format.

        :param output: The `output` parameter in the `set_output` method is used to specify the name of
        the output file. If the config file has an 'output' key, the method sets the output to the value
        of that key.
        If no output is provided, it sets the output to `None`
        :type output: str
        """

        # Accept a file-like object (use its .name) or a plain path string
        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format: derive name/extension/format from the output path
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None

    def set_header(self) -> None:
        """
        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param (explicit header file)
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr next to the input
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:
                        # Best-effort: fall back to a minimal default header
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:
            # No input file: no header available
            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
        DataFrame based on the connection format.

        :param query: The `query` parameter in the `get_query_to_df` function is a string that
        represents the SQL query you want to execute.
        This query will be used to fetch data from a
        database and convert it into a pandas DataFrame
        :type query: str
        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
        function will only fetch up to that number of rows from the database query result. If no limit
        is specified, the full result is returned
        :type limit: int
        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
        """

        # Connexion format ("duckdb" or "sqlite"; raises otherwise)
        connexion_format = self.get_connexion_format()

        # Limit in query: fetch only the first batch/chunk of `limit` rows
        if limit:
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df

    def get_overview(self) -> None:
        """
        The function prints the input, output, config, and dataframe of the current object
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # garbage collector: release the (potentially large) dataframe
        del df
        gc.collect()

        return None

    def get_stats(self) -> dict:
        """
        The `get_stats` function calculates and returns various statistics of the current object,
        including information about the input file, variants, samples, header fields, quality, and
        SNVs/InDels.
        :return: a dictionary containing various statistics of the current object. The dictionary has
        the following structure:
        """

        # Log
        log.info(f"Stats Calculation...")

        # Variants table
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

stats["Infos"]["Number of variants"] = int(nb_of_variants) 579 580 ### Samples 581 582 # Init 583 samples = {} 584 nb_of_samples = 0 585 586 # Check Samples 587 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 588 log.debug(f"Check samples...") 589 for sample in self.get_header_sample_list(): 590 sql_query_samples = f""" 591 SELECT '{sample}' as sample, 592 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 593 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 594 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 595 FROM {table_variants_from} 596 WHERE ( 597 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 598 AND 599 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 600 ) 601 GROUP BY genotype 602 """ 603 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 604 sample_genotype_count = sql_query_genotype_df["count"].sum() 605 if len(sql_query_genotype_df): 606 nb_of_samples += 1 607 samples[f"{sample} - {sample_genotype_count} variants"] = ( 608 sql_query_genotype_df.to_dict(orient="index") 609 ) 610 611 stats["Samples"] = samples 612 stats["Infos"]["Number of samples"] = nb_of_samples 613 614 # # 615 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 616 # stats["Infos"]["Number of samples"] = nb_of_samples 617 # elif nb_of_samples: 618 # stats["Infos"]["Number of samples"] = "not a VCF format" 619 620 ### INFO and FORMAT fields 621 header_types_df = {} 622 header_types_list = { 623 "List of INFO fields": header_infos, 624 "List of FORMAT fields": header_formats, 625 } 626 i = 0 627 for header_type in header_types_list: 628 629 header_type_infos = header_types_list.get(header_type) 630 header_infos_dict = {} 631 632 for info in header_type_infos: 633 634 i += 1 635 header_infos_dict[i] = {} 636 637 # ID 638 header_infos_dict[i]["id"] = info 639 640 # num 641 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 642 if header_type_infos[info].num in genotype_map.keys(): 643 header_infos_dict[i]["Number"] = genotype_map.get( 644 header_type_infos[info].num 645 ) 646 else: 647 header_infos_dict[i]["Number"] = header_type_infos[info].num 648 649 # type 650 if header_type_infos[info].type: 651 header_infos_dict[i]["Type"] = header_type_infos[info].type 652 else: 653 header_infos_dict[i]["Type"] = "." 654 655 # desc 656 if header_type_infos[info].desc != None: 657 header_infos_dict[i]["Description"] = header_type_infos[info].desc 658 else: 659 header_infos_dict[i]["Description"] = "" 660 661 if len(header_infos_dict): 662 header_types_df[header_type] = pd.DataFrame.from_dict( 663 header_infos_dict, orient="index" 664 ).to_dict(orient="index") 665 666 # Stats 667 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 668 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 669 stats["Header"] = header_types_df 670 671 ### QUAL 672 if "QUAL" in self.get_header_columns(): 673 sql_query_qual = f""" 674 SELECT 675 avg(CAST(QUAL AS INTEGER)) AS Average, 676 min(CAST(QUAL AS INTEGER)) AS Minimum, 677 max(CAST(QUAL AS INTEGER)) AS Maximum, 678 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 679 median(CAST(QUAL AS INTEGER)) AS Median, 680 variance(CAST(QUAL AS INTEGER)) AS Variance 681 FROM {table_variants_from} 682 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 683 """ 684 685 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 686 stats["Quality"] = {"Stats": qual} 687 688 ### SNV and InDel 689 690 sql_query_snv = f""" 691 692 SELECT Type, count FROM ( 693 694 SELECT 695 'Total' AS Type, 696 count(*) AS count 697 FROM {table_variants_from} 698 699 UNION 700 701 SELECT 702 'MNV' AS Type, 703 count(*) AS count 704 FROM {table_variants_from} 705 WHERE len(REF) > 1 AND len(ALT) > 1 706 AND len(REF) = len(ALT) 707 708 UNION 709 710 SELECT 711 'InDel' AS Type, 712 count(*) AS count 713 FROM 
{table_variants_from} 714 WHERE len(REF) > 1 OR len(ALT) > 1 715 AND len(REF) != len(ALT) 716 717 UNION 718 719 SELECT 720 'SNV' AS Type, 721 count(*) AS count 722 FROM {table_variants_from} 723 WHERE len(REF) = 1 AND len(ALT) = 1 724 725 ) 726 727 ORDER BY count DESC 728 729 """ 730 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 731 732 sql_query_snv_substitution = f""" 733 SELECT 734 concat(REF, '>', ALT) AS 'Substitution', 735 count(*) AS count 736 FROM {table_variants_from} 737 WHERE len(REF) = 1 AND len(ALT) = 1 738 GROUP BY REF, ALT 739 ORDER BY count(*) DESC 740 """ 741 snv_substitution = ( 742 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 743 ) 744 stats["Variants"]["Counts"] = snv_indel 745 stats["Variants"]["Substitutions"] = snv_substitution 746 747 return stats 748 749 def stats_to_file(self, file: str = None) -> str: 750 """ 751 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 752 into a JSON object, and writes the JSON object to the specified file. 753 754 :param file: The `file` parameter is a string that represents the file path where the JSON data 755 will be written 756 :type file: str 757 :return: the name of the file that was written to. 758 """ 759 760 # Get stats 761 stats = self.get_stats() 762 763 # Serializing json 764 json_object = json.dumps(stats, indent=4) 765 766 # Writing to sample.json 767 with open(file, "w") as outfile: 768 outfile.write(json_object) 769 770 return file 771 772 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 773 """ 774 The `print_stats` function generates a markdown file and prints the statistics contained in a 775 JSON file in a formatted manner. 776 777 :param output_file: The `output_file` parameter is a string that specifies the path and filename 778 of the output file where the stats will be printed in Markdown format. 
        If no `output_file` is
        provided, a temporary directory will be created and the stats will be saved in a file named
        "stats.md" within that
        :type output_file: str
        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
        file where the statistics will be saved. If no value is provided, a temporary directory will be
        created and a default file name "stats.json" will be used
        :type json_file: str
        :return: The function `print_stats` does not return any value. It has a return type annotation
        of `None`.
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files: default to temporary files when not provided
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Read stats file back (yaml.safe_load also parses JSON)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output buffers for markdown title, index and body
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Render as a markdown table when the value is
                        # dict-like (possibly JSON-encoded), else as a bullet
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f" - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown on stdout
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

    def get_input(self) -> str:
        """
        It returns the value of the input variable.
        :return: The input is being returned.
        """
        return self.input

    def get_input_format(self, input_file: str = None) -> str:
        """
        This function returns the format of the given input file, or of the current
        input file when none is provided.

        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
        represents the file path of the input file. If no `input_file` is provided when calling the
        method, it will default to `None` and the current input is used
        :type input_file: str
        :return: The format of the input file is being returned.
        """

        if not input_file:
            input_file = self.get_input()
        input_format = get_file_format(input_file)
        return input_format

    def get_input_compressed(self, input_file: str = None) -> str:
        """
        The function `get_input_compressed` returns whether the input file is compressed.

        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
        that represents the file path of the input file.
        If no `input_file` is provided when calling the
        method, it will default to `None` and the method will then call `self.get_input()`
        :type input_file: str
        :return: The function `get_input_compressed` returns the compressed status of the input
        file.
        """

        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        It returns the output file.
        :return: The output file.
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        The function `get_output_format` returns the format of the output file if
        provided, or of the current output otherwise.

        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
        that represents the file path of the output file. If no `output_file` is provided when calling
        the method, it will default to the output obtained from the `get_output` method of the class
        instance.
        :type output_file: str
        :return: The format of the output file is being returned.
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        It returns the config
        :return: The config variable is being returned.
        """
        return self.config

    def get_param(self) -> dict:
        """
        It returns the param
        :return: The param variable is being returned.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        It returns the connexion_db attribute of the object
        :return: The connexion_db is being returned.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        It returns the prefix of the object.
        :return: The prefix is being returned.
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        This function returns the table_variants attribute of the object

        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
        defaults to select (optional)
        :return: The table_variants attribute of the object.
        """

        # Access
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update": bare table name
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from": aliased table or file reference
        elif clause in ["from"]:
            # For Read Only parquet input, query the file directly
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        The function `get_tmp_dir` returns the temporary directory path based on configuration
        parameters or a default path.
        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
        configuration, parameters, and a default value of "/tmp".
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        Returns the configured connexion type, defaulting to "memory".

        :return: The connexion type is being returned.
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        It returns the connection object

        :return: The connection object.
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        This function closes the connection to the database.
        :return: The connection is being closed.
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        This function returns the header of the VCF file, either as a vcf.Reader object
        or as a list of strings.

        :param type: the type of header you want to get ("vcf" or "list"), defaults to vcf (optional)
        :return: The header of the vcf file.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            # No header loaded: fall back to the minimal required VCF header
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_length(self, file: str = None) -> int:
        """
        The function `get_header_length` returns the length of the header list, excluding the #CHROM
        line.

        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
        header file. If this argument is provided, the function will read the header from the specified
        file and return the length of the header list minus 1 (to exclude the #CHROM line)
        :type file: str
        :return: the length of the header list, excluding the #CHROM line.
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        This function returns the columns line ("#CHROM ...") of the VCF header

        :return: The last line of the header list (the columns line), or "" without a header.
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        This function returns the columns line of the VCF header as a list

        :return: The header columns as a list of column names.
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        This function returns the header columns as a comma-separated list of
        double-quoted SQL identifiers.

        :return: The header columns as a SQL column list string.
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    def get_header_sample_list(self) -> list:
        """
        This function returns the list of sample names from the VCF header.

        :return: The list of sample names.
        """
        return self.header_vcf.samples

    def get_verbose(self) -> bool:
        """
        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
        exist

        :return: The value of the key "verbose" in the config dictionary.
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        It returns the connexion format of the object.
        :return: The connexion_format is being returned.
        :raises ValueError: if the connexion format is not "duckdb" or "sqlite".
        """
        connexion_format = self.connexion_format
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table based on the specified
        database format.

        :param file: The `file` parameter is the file that you want to load into a table.
        It should be
        the path to the file on your system
        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
        should contain the names of the columns in the table where the data will be inserted. The column
        names should be separated by commas within the string, for example '"id", "name"'
        :type columns: str
        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
        the number of lines to skip at the beginning of the file before reading the actual data. This
        parameter allows you to skip any header information present in the file before processing the
        data, defaults to 0
        :type header_len: int (optional)
        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
        separator character that is used in the file being read. The default separator is
        `\t`, which represents a tab character. You can change this parameter to a different
        separator character if needed, defaults to \t
        :type sep: str (optional)
        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
        when processing the file in chunks. The default value for
        `chunksize` is set to 1000000.
        This means that the file will be read in chunks of 1000000 rows, defaults
        to 1000000
        :type chunksize: int (optional)
        """

        # Config may override the chunk size ("load" -> "chunk")
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves the local pandas DataFrame `chunk` by
                    # name (replacement scan)
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
        table before loading the data and specify a sample size.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file.
If it is set to `None`, the default value of 20480 will be used, defaults to 1209 20480 1210 :type sample_size: int (optional) 1211 """ 1212 1213 log.info("Loading...") 1214 1215 # change input file 1216 if input_file: 1217 self.set_input(input_file) 1218 self.set_header() 1219 1220 # drop variants table 1221 if drop_variants_table: 1222 self.drop_variants_table() 1223 1224 # get table variants 1225 table_variants = self.get_table_variants() 1226 1227 # Access 1228 access = self.get_config().get("access", None) 1229 log.debug(f"access: {access}") 1230 1231 # Input format and compress 1232 input_format = self.get_input_format() 1233 input_compressed = self.get_input_compressed() 1234 log.debug(f"input_format: {input_format}") 1235 log.debug(f"input_compressed: {input_compressed}") 1236 1237 # input_compressed_format 1238 if input_compressed: 1239 input_compressed_format = "gzip" 1240 else: 1241 input_compressed_format = "none" 1242 log.debug(f"input_compressed_format: {input_compressed_format}") 1243 1244 # Connexion format 1245 connexion_format = self.get_connexion_format() 1246 1247 # Sample size 1248 if not sample_size: 1249 sample_size = -1 1250 log.debug(f"sample_size: {sample_size}") 1251 1252 # Load data 1253 log.debug(f"Load Data from {input_format}") 1254 1255 # DuckDB connexion 1256 if connexion_format in ["duckdb"]: 1257 1258 # Database already exists 1259 if self.input_format in ["db", "duckdb"]: 1260 1261 if connexion_format in ["duckdb"]: 1262 log.debug(f"Input file format '{self.input_format}' duckDB") 1263 else: 1264 log.error( 1265 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1266 ) 1267 raise ValueError( 1268 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1269 ) 1270 1271 # Load from existing database format 1272 else: 1273 1274 try: 1275 # Create Table or View 1276 database = Database(database=self.input) 1277 sql_from = 
database.get_sql_from(sample_size=sample_size) 1278 1279 if access in ["RO"]: 1280 sql_load = ( 1281 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1282 ) 1283 else: 1284 sql_load = ( 1285 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1286 ) 1287 self.conn.execute(sql_load) 1288 1289 except: 1290 # Format not available 1291 log.error(f"Input file format '{self.input_format}' not available") 1292 raise ValueError( 1293 f"Input file format '{self.input_format}' not available" 1294 ) 1295 1296 # SQLite connexion 1297 elif connexion_format in ["sqlite"] and input_format in [ 1298 "vcf", 1299 "tsv", 1300 "csv", 1301 "psv", 1302 ]: 1303 1304 # Main structure 1305 structure = { 1306 "#CHROM": "VARCHAR", 1307 "POS": "INTEGER", 1308 "ID": "VARCHAR", 1309 "REF": "VARCHAR", 1310 "ALT": "VARCHAR", 1311 "QUAL": "VARCHAR", 1312 "FILTER": "VARCHAR", 1313 "INFO": "VARCHAR", 1314 } 1315 1316 # Strcuture with samples 1317 structure_complete = structure 1318 if self.get_header_sample_list(): 1319 structure["FORMAT"] = "VARCHAR" 1320 for sample in self.get_header_sample_list(): 1321 structure_complete[sample] = "VARCHAR" 1322 1323 # Columns list for create and insert 1324 sql_create_table_columns = [] 1325 sql_create_table_columns_list = [] 1326 for column in structure_complete: 1327 column_type = structure_complete[column] 1328 sql_create_table_columns.append( 1329 f'"{column}" {column_type} default NULL' 1330 ) 1331 sql_create_table_columns_list.append(f'"{column}"') 1332 1333 # Create database 1334 log.debug(f"Create Table {table_variants}") 1335 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1336 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1337 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1338 self.conn.execute(sql_create_table) 1339 1340 # chunksize define length of file chunk load file 1341 chunksize = 100000 1342 1343 # delimiter 1344 delimiter 
= file_format_delimiters.get(input_format, "\t") 1345 1346 # Load the input file 1347 with open(self.input, "rt") as input_file: 1348 1349 # Use the appropriate file handler based on the input format 1350 if input_compressed: 1351 input_file = bgzf.open(self.input, "rt") 1352 if input_format in ["vcf"]: 1353 header_len = self.get_header_length() 1354 else: 1355 header_len = 0 1356 1357 # Insert the file contents into a table 1358 self.insert_file_to_table( 1359 input_file, 1360 columns=sql_create_table_columns_list_sql, 1361 header_len=header_len, 1362 sep=delimiter, 1363 chunksize=chunksize, 1364 ) 1365 1366 else: 1367 log.error( 1368 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1369 ) 1370 raise ValueError( 1371 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1372 ) 1373 1374 # Explode INFOS fields into table fields 1375 if self.get_explode_infos(): 1376 self.explode_infos( 1377 prefix=self.get_explode_infos_prefix(), 1378 fields=self.get_explode_infos_fields(), 1379 force=True, 1380 ) 1381 1382 # Create index after insertion 1383 self.create_indexes() 1384 1385 def get_explode_infos(self) -> bool: 1386 """ 1387 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1388 to False if it is not set. 1389 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1390 value. If the parameter is not present, it will return False. 1391 """ 1392 1393 return self.get_param().get("explode", {}).get("explode_infos", False) 1394 1395 def get_explode_infos_fields( 1396 self, 1397 explode_infos_fields: str = None, 1398 remove_fields_not_in_header: bool = False, 1399 ) -> list: 1400 """ 1401 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1402 the input parameter `explode_infos_fields`. 
1403 1404 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1405 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1406 comma-separated list of field names to explode 1407 :type explode_infos_fields: str 1408 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1409 flag that determines whether to remove fields that are not present in the header. If it is set 1410 to `True`, any field that is not in the header will be excluded from the list of exploded 1411 information fields. If it is set to `, defaults to False 1412 :type remove_fields_not_in_header: bool (optional) 1413 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1414 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1415 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1416 Otherwise, it returns a list of exploded information fields after removing any spaces and 1417 splitting the string by commas. 
1418 """ 1419 1420 # If no fields, get it in param 1421 if not explode_infos_fields: 1422 explode_infos_fields = ( 1423 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1424 ) 1425 1426 # If no fields, defined as all fields in header using keyword 1427 if not explode_infos_fields: 1428 explode_infos_fields = "*" 1429 1430 # If fields list not empty 1431 if explode_infos_fields: 1432 1433 # Input fields list 1434 if isinstance(explode_infos_fields, str): 1435 fields_input = explode_infos_fields.split(",") 1436 elif isinstance(explode_infos_fields, list): 1437 fields_input = explode_infos_fields 1438 else: 1439 fields_input = [] 1440 1441 # Fields list without * keyword 1442 fields_without_all = fields_input.copy() 1443 if "*".casefold() in (item.casefold() for item in fields_without_all): 1444 fields_without_all.remove("*") 1445 1446 # Fields in header 1447 fields_in_header = sorted(list(set(self.get_header().infos))) 1448 1449 # Construct list of fields 1450 fields_output = [] 1451 for field in fields_input: 1452 1453 # Strip field 1454 field = field.strip() 1455 1456 # format keyword * in regex 1457 if field.upper() in ["*"]: 1458 field = ".*" 1459 1460 # Find all fields with pattern 1461 r = re.compile(field) 1462 fields_search = sorted(list(filter(r.match, fields_in_header))) 1463 1464 # Remove fields input from search 1465 if fields_search != [field]: 1466 fields_search = sorted( 1467 list(set(fields_search).difference(fields_input)) 1468 ) 1469 1470 # If field is not in header (avoid not well formatted header) 1471 if not fields_search and not remove_fields_not_in_header: 1472 fields_search = [field] 1473 1474 # Add found fields 1475 for new_field in fields_search: 1476 # Add field, if not already exists, and if it is in header (if asked) 1477 if ( 1478 new_field not in fields_output 1479 and ( 1480 not remove_fields_not_in_header 1481 or new_field in fields_in_header 1482 ) 1483 and new_field not in [".*"] 1484 ): 1485 
fields_output.append(new_field) 1486 1487 return fields_output 1488 1489 else: 1490 1491 return [] 1492 1493 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1494 """ 1495 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1496 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1497 not provided. 1498 1499 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1500 prefix to be used for exploding or expanding information 1501 :type explode_infos_prefix: str 1502 :return: the value of the variable `explode_infos_prefix`. 1503 """ 1504 1505 if not explode_infos_prefix: 1506 explode_infos_prefix = ( 1507 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1508 ) 1509 1510 return explode_infos_prefix 1511 1512 def add_column( 1513 self, 1514 table_name, 1515 column_name, 1516 column_type, 1517 default_value=None, 1518 drop: bool = False, 1519 ) -> dict: 1520 """ 1521 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1522 doesn't already exist. 1523 1524 :param table_name: The name of the table to which you want to add a column 1525 :param column_name: The parameter "column_name" is the name of the column that you want to add 1526 to the table 1527 :param column_type: The `column_type` parameter specifies the data type of the column that you 1528 want to add to the table. It should be a string that represents the desired data type, such as 1529 "INTEGER", "TEXT", "REAL", etc 1530 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1531 default value for the newly added column. 
If a default value is provided, it will be assigned to 1532 the column for any existing rows that do not have a value for that column 1533 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1534 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1535 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1536 to False 1537 :type drop: bool (optional) 1538 :return: a boolean value indicating whether the column was successfully added to the table. 1539 """ 1540 1541 # added 1542 added = False 1543 dropped = False 1544 1545 # Check if the column already exists in the table 1546 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1547 columns = self.get_query_to_df(query).columns.tolist() 1548 if column_name in columns: 1549 log.debug( 1550 f"The {column_name} column already exists in the {table_name} table" 1551 ) 1552 if drop: 1553 self.drop_column(table_name=table_name, column_name=column_name) 1554 dropped = True 1555 else: 1556 return None 1557 else: 1558 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1559 1560 # Add column in table 1561 add_column_query = ( 1562 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1563 ) 1564 if default_value is not None: 1565 add_column_query += f" DEFAULT {default_value}" 1566 self.execute_query(add_column_query) 1567 added = not dropped 1568 log.debug( 1569 f"The {column_name} column was successfully added to the {table_name} table" 1570 ) 1571 1572 if added: 1573 added_column = { 1574 "table_name": table_name, 1575 "column_name": column_name, 1576 "column_type": column_type, 1577 "default_value": default_value, 1578 } 1579 else: 1580 added_column = None 1581 1582 return added_column 1583 1584 def drop_column( 1585 self, column: dict = None, table_name: str = None, column_name: str = None 1586 ) -> bool: 1587 """ 1588 The `drop_column` function drops a 
specified column from a given table in a database and returns 1589 True if the column was successfully dropped, and False if the column does not exist in the 1590 table. 1591 1592 :param column: The `column` parameter is a dictionary that contains information about the column 1593 you want to drop. It has two keys: 1594 :type column: dict 1595 :param table_name: The `table_name` parameter is the name of the table from which you want to 1596 drop a column 1597 :type table_name: str 1598 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1599 from the table 1600 :type column_name: str 1601 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1602 and False if the column does not exist in the table. 1603 """ 1604 1605 # Find column infos 1606 if column: 1607 if isinstance(column, dict): 1608 table_name = column.get("table_name", None) 1609 column_name = column.get("column_name", None) 1610 elif isinstance(column, str): 1611 table_name = self.get_table_variants() 1612 column_name = column 1613 else: 1614 table_name = None 1615 column_name = None 1616 1617 if not table_name and not column_name: 1618 return False 1619 1620 # Removed 1621 removed = False 1622 1623 # Check if the column already exists in the table 1624 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1625 columns = self.get_query_to_df(query).columns.tolist() 1626 if column_name in columns: 1627 log.debug(f"The {column_name} column exists in the {table_name} table") 1628 else: 1629 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1630 return False 1631 1632 # Add column in table # ALTER TABLE integers DROP k 1633 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1634 self.execute_query(add_column_query) 1635 removed = True 1636 log.debug( 1637 f"The {column_name} column was successfully dropped to the {table_name} table" 1638 ) 1639 1640 return removed 1641 1642 def 
explode_infos( 1643 self, 1644 prefix: str = None, 1645 create_index: bool = False, 1646 fields: list = None, 1647 force: bool = False, 1648 proccess_all_fields_together: bool = False, 1649 ) -> list: 1650 """ 1651 The `explode_infos` function takes a VCF file and explodes the INFO fields into individual 1652 columns, returning a list of added columns. 1653 1654 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1655 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1656 `self.get_explode_infos_prefix()` as the prefix 1657 :type prefix: str 1658 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1659 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1660 `False`, indexes will not be created. The default value is `False`, defaults to False 1661 :type create_index: bool (optional) 1662 :param fields: The `fields` parameter is a list of INFO fields that you want to explode into 1663 individual columns. If this parameter is not provided, all INFO fields will be exploded 1664 :type fields: list 1665 :param force: The `force` parameter is a boolean flag that determines whether to drop and 1666 recreate the column if it already exists in the table. If `force` is set to `True`, the column 1667 will be dropped and recreated. If `force` is set to `False`, the column will not be dropped, 1668 defaults to False 1669 :type force: bool (optional) 1670 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1671 flag that determines whether to process all the INFO fields together or individually. If set to 1672 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1673 be processed individually, defaults to False 1674 :type proccess_all_fields_together: bool (optional) 1675 :return: The function `explode_infos` returns a list of added columns. 1676 """ 1677 1678 # drop indexes 1679 self.drop_indexes() 1680 1681 # connexion format 1682 connexion_format = self.get_connexion_format() 1683 1684 # Access 1685 access = self.get_config().get("access", None) 1686 1687 # Added columns 1688 added_columns = [] 1689 1690 if access not in ["RO"]: 1691 1692 # prefix 1693 if prefix in [None, True] or not isinstance(prefix, str): 1694 if self.get_explode_infos_prefix() not in [None, True]: 1695 prefix = self.get_explode_infos_prefix() 1696 else: 1697 prefix = "INFO/" 1698 1699 # table variants 1700 table_variants = self.get_table_variants(clause="select") 1701 1702 # extra infos 1703 try: 1704 extra_infos = self.get_extra_infos() 1705 except: 1706 extra_infos = [] 1707 1708 # Header infos 1709 header_infos = self.get_header().infos 1710 1711 log.debug( 1712 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1713 ) 1714 1715 sql_info_alter_table_array = [] 1716 1717 # Info fields to check 1718 fields_list = list(header_infos) 1719 if fields: 1720 fields_list += fields 1721 fields_list = set(fields_list) 1722 1723 # If no fields 1724 if not fields: 1725 fields = [] 1726 1727 # Translate fields if patterns 1728 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1729 1730 for info in fields: 1731 1732 info_id_sql = prefix + info 1733 1734 if ( 1735 info in fields_list 1736 or prefix + info in fields_list 1737 or info in extra_infos 1738 ): 1739 1740 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1741 1742 if info in header_infos: 1743 info_type = header_infos[info].type 1744 info_num = header_infos[info].num 1745 else: 1746 info_type = "String" 1747 info_num = 0 1748 1749 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1750 if 
info_num != 1: 1751 type_sql = "VARCHAR" 1752 1753 # Add field 1754 added_column = self.add_column( 1755 table_name=table_variants, 1756 column_name=info_id_sql, 1757 column_type=type_sql, 1758 default_value="null", 1759 drop=force, 1760 ) 1761 1762 if added_column: 1763 added_columns.append(added_column) 1764 1765 if added_column or force: 1766 1767 # add field to index 1768 self.index_additionnal_fields.append(info_id_sql) 1769 1770 # Update field array 1771 if connexion_format in ["duckdb"]: 1772 update_info_field = f""" 1773 "{info_id_sql}" = 1774 CASE 1775 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1776 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1777 END 1778 """ 1779 elif connexion_format in ["sqlite"]: 1780 update_info_field = f""" 1781 "{info_id_sql}" = 1782 CASE 1783 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1784 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1785 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1786 END 1787 """ 1788 1789 sql_info_alter_table_array.append(update_info_field) 1790 1791 if sql_info_alter_table_array: 1792 1793 # By chromosomes 1794 try: 1795 chromosomes_list = list( 1796 self.get_query_to_df( 1797 f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1798 )["#CHROM"] 1799 ) 1800 except: 1801 chromosomes_list = [None] 1802 1803 for chrom in chromosomes_list: 1804 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1805 1806 # Where clause 1807 where_clause = "" 1808 if chrom and len(chromosomes_list) > 1: 1809 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1810 1811 # Update table 1812 if 
proccess_all_fields_together: 1813 sql_info_alter_table_array_join = ", ".join( 1814 sql_info_alter_table_array 1815 ) 1816 if sql_info_alter_table_array_join: 1817 sql_info_alter_table = f""" 1818 UPDATE {table_variants} 1819 SET {sql_info_alter_table_array_join} 1820 {where_clause} 1821 """ 1822 log.debug( 1823 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1824 ) 1825 # log.debug(sql_info_alter_table) 1826 self.conn.execute(sql_info_alter_table) 1827 else: 1828 sql_info_alter_num = 0 1829 for sql_info_alter in sql_info_alter_table_array: 1830 sql_info_alter_num += 1 1831 sql_info_alter_table = f""" 1832 UPDATE {table_variants} 1833 SET {sql_info_alter} 1834 {where_clause} 1835 """ 1836 log.debug( 1837 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 1838 ) 1839 # log.debug(sql_info_alter_table) 1840 self.conn.execute(sql_info_alter_table) 1841 1842 # create indexes 1843 if create_index: 1844 self.create_indexes() 1845 1846 return added_columns 1847 1848 def create_indexes(self) -> None: 1849 """ 1850 Create indexes on the table after insertion 1851 """ 1852 1853 # Access 1854 access = self.get_config().get("access", None) 1855 1856 # get table variants 1857 table_variants = self.get_table_variants("FROM") 1858 1859 if self.get_indexing() and access not in ["RO"]: 1860 # Create index 1861 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1862 self.conn.execute(sql_create_table_index) 1863 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1864 self.conn.execute(sql_create_table_index) 1865 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1866 self.conn.execute(sql_create_table_index) 1867 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS 
idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1868 self.conn.execute(sql_create_table_index) 1869 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1870 self.conn.execute(sql_create_table_index) 1871 for field in self.index_additionnal_fields: 1872 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1873 self.conn.execute(sql_create_table_index) 1874 1875 def drop_indexes(self) -> None: 1876 """ 1877 Create indexes on the table after insertion 1878 """ 1879 1880 # Access 1881 access = self.get_config().get("access", None) 1882 1883 # get table variants 1884 table_variants = self.get_table_variants("FROM") 1885 1886 # Get database format 1887 connexion_format = self.get_connexion_format() 1888 1889 if access not in ["RO"]: 1890 if connexion_format in ["duckdb"]: 1891 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 1892 elif connexion_format in ["sqlite"]: 1893 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 1894 1895 list_indexes = self.conn.execute(sql_list_indexes) 1896 index_names = [row[0] for row in list_indexes.fetchall()] 1897 for index in index_names: 1898 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 1899 self.conn.execute(sql_drop_table_index) 1900 1901 def read_vcf_header(self, f) -> list: 1902 """ 1903 It reads the header of a VCF file and returns a list of the header lines 1904 1905 :param f: the file object 1906 :return: The header lines of the VCF file. 
1907 """ 1908 1909 header_list = [] 1910 for line in f: 1911 header_list.append(line) 1912 if line.startswith("#CHROM"): 1913 break 1914 return header_list 1915 1916 def read_vcf_header_file(self, file: str = None) -> list: 1917 """ 1918 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 1919 uncompressed files. 1920 1921 :param file: The `file` parameter is a string that represents the path to the VCF header file 1922 that you want to read. It is an optional parameter, so if you don't provide a value, it will 1923 default to `None` 1924 :type file: str 1925 :return: The function `read_vcf_header_file` returns a list. 1926 """ 1927 1928 if self.get_input_compressed(input_file=file): 1929 with bgzf.open(file, "rt") as f: 1930 return self.read_vcf_header(f=f) 1931 else: 1932 with open(file, "rt") as f: 1933 return self.read_vcf_header(f=f) 1934 1935 def execute_query(self, query: str): 1936 """ 1937 It takes a query as an argument, executes it, and returns the results 1938 1939 :param query: The query to be executed 1940 :return: The result of the query is being returned. 1941 """ 1942 if query: 1943 return self.conn.execute(query) # .fetchall() 1944 else: 1945 return None 1946 1947 def export_output( 1948 self, 1949 output_file: str | None = None, 1950 output_header: str | None = None, 1951 export_header: bool = True, 1952 query: str | None = None, 1953 parquet_partitions: list | None = None, 1954 chunk_size: int | None = None, 1955 threads: int | None = None, 1956 sort: bool = False, 1957 index: bool = False, 1958 order_by: str | None = None, 1959 ) -> bool: 1960 """ 1961 The `export_output` function exports data from a VCF file to a specified output file in various 1962 formats, including VCF, CSV, TSV, PSV, and Parquet. 1963 1964 :param output_file: The `output_file` parameter is a string that specifies the name of the 1965 output file to be generated by the function. 
This is where the exported data will be saved 1966 :type output_file: str 1967 :param output_header: The `output_header` parameter is a string that specifies the name of the 1968 file where the header of the VCF file will be exported. If this parameter is not provided, the 1969 header will be exported to a file with the same name as the `output_file` parameter, but with 1970 the extension " 1971 :type output_header: str 1972 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1973 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1974 True, the header will be exported to a file. If `export_header` is False, the header will not 1975 be, defaults to True, if output format is not VCF 1976 :type export_header: bool (optional) 1977 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1978 select specific data from the VCF file before exporting it. If provided, only the data that 1979 matches the query will be exported 1980 :type query: str 1981 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1982 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 1983 organize data in a hierarchical directory structure based on the values of one or more columns. 1984 This can improve query performance when working with large datasets 1985 :type parquet_partitions: list 1986 :param chunk_size: The `chunk_size` parameter specifies the number of 1987 records in batch when exporting data in Parquet format. This parameter is used for 1988 partitioning the Parquet file into multiple files. 1989 :type chunk_size: int 1990 :param threads: The `threads` parameter is an optional parameter that specifies the number of 1991 threads to be used during the export process. It determines the level of parallelism and can 1992 improve the performance of the export operation. 
If not provided, the function will use the 1993 default number of threads 1994 :type threads: int 1995 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 1996 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 1997 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 1998 False 1999 :type sort: bool (optional) 2000 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2001 created on the output file. If `index` is True, an index will be created. If `index` is False, 2002 no index will be created. The default value is False, defaults to False 2003 :type index: bool (optional) 2004 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2005 sorting the output file. This parameter is only applicable when exporting data in VCF format 2006 :type order_by: str 2007 :return: a boolean value. It checks if the output file exists and returns True if it does, or 2008 None if it doesn't. 
2009 """ 2010 2011 # Log 2012 log.info("Exporting...") 2013 2014 # Full path 2015 output_file = full_path(output_file) 2016 output_header = full_path(output_header) 2017 2018 # Config 2019 config = self.get_config() 2020 2021 # Param 2022 param = self.get_param() 2023 2024 # Tmp files to remove 2025 tmp_to_remove = [] 2026 2027 # If no output, get it 2028 if not output_file: 2029 output_file = self.get_output() 2030 2031 # If not threads 2032 if not threads: 2033 threads = self.get_threads() 2034 2035 # Auto header name with extension 2036 if export_header or output_header: 2037 if not output_header: 2038 output_header = f"{output_file}.hdr" 2039 # Export header 2040 self.export_header(output_file=output_file) 2041 2042 # Switch off export header if VCF output 2043 output_file_type = get_file_format(output_file) 2044 if output_file_type in ["vcf"]: 2045 export_header = False 2046 tmp_to_remove.append(output_header) 2047 2048 # Chunk size 2049 if not chunk_size: 2050 chunk_size = config.get("chunk_size", None) 2051 2052 # Parquet partition 2053 if not parquet_partitions: 2054 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2055 if parquet_partitions and isinstance(parquet_partitions, str): 2056 parquet_partitions = parquet_partitions.split(",") 2057 2058 # Order by 2059 if not order_by: 2060 order_by = param.get("export", {}).get("order_by", "") 2061 2062 # Header in output 2063 header_in_output = param.get("export", {}).get("include_header", False) 2064 2065 # Database 2066 database_source = self.get_connexion() 2067 2068 # Connexion format 2069 connexion_format = self.get_connexion_format() 2070 2071 # Explode infos 2072 if self.get_explode_infos(): 2073 self.explode_infos( 2074 prefix=self.get_explode_infos_prefix(), 2075 fields=self.get_explode_infos_fields(), 2076 force=False, 2077 ) 2078 2079 # if connexion_format in ["sqlite"] or query: 2080 if connexion_format in ["sqlite"]: 2081 2082 # Export in Parquet 2083 random_tmp = 
"".join( 2084 random.choice(string.ascii_lowercase) for i in range(10) 2085 ) 2086 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2087 tmp_to_remove.append(database_source) 2088 2089 # Table Variants 2090 table_variants = self.get_table_variants() 2091 2092 # Create export query 2093 sql_query_export_subquery = f""" 2094 SELECT * FROM {table_variants} 2095 """ 2096 2097 # Write source file 2098 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2099 2100 # Create database 2101 database = Database( 2102 database=database_source, 2103 table="variants", 2104 header_file=output_header, 2105 conn_config=self.get_connexion_config(), 2106 ) 2107 2108 # Existing colomns header 2109 # existing_columns_header = database.get_header_file_columns(output_header) 2110 existing_columns_header = database.get_header_columns_from_database() 2111 2112 # Export file 2113 database.export( 2114 output_database=output_file, 2115 output_header=output_header, 2116 existing_columns_header=existing_columns_header, 2117 parquet_partitions=parquet_partitions, 2118 chunk_size=chunk_size, 2119 threads=threads, 2120 sort=sort, 2121 index=index, 2122 header_in_output=header_in_output, 2123 order_by=order_by, 2124 query=query, 2125 export_header=export_header, 2126 ) 2127 2128 # Remove 2129 remove_if_exists(tmp_to_remove) 2130 2131 return (os.path.exists(output_file) or None) and ( 2132 os.path.exists(output_file) or None 2133 ) 2134 2135 def get_extra_infos(self, table: str = None) -> list: 2136 """ 2137 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2138 in the header. 2139 2140 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2141 name of the table from which you want to retrieve the extra columns that are not present in the 2142 header. 
If the `table` parameter is not provided when calling the function, it will default to 2143 using the variants 2144 :type table: str 2145 :return: A list of columns that are in the specified table but not in the header of the table. 2146 """ 2147 2148 header_columns = [] 2149 2150 if not table: 2151 table = self.get_table_variants(clause="from") 2152 header_columns = self.get_header_columns() 2153 2154 # Check all columns in the database 2155 query = f""" SELECT * FROM {table} LIMIT 1 """ 2156 log.debug(f"query {query}") 2157 table_columns = self.get_query_to_df(query).columns.tolist() 2158 extra_columns = [] 2159 2160 # Construct extra infos (not in header) 2161 for column in table_columns: 2162 if column not in header_columns: 2163 extra_columns.append(column) 2164 2165 return extra_columns 2166 2167 def get_extra_infos_sql(self, table: str = None) -> str: 2168 """ 2169 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2170 by double quotes 2171 2172 :param table: The name of the table to get the extra infos from. If None, the default table is 2173 used 2174 :type table: str 2175 :return: A string of the extra infos 2176 """ 2177 2178 return ", ".join( 2179 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2180 ) 2181 2182 def export_header( 2183 self, 2184 header_name: str = None, 2185 output_file: str = None, 2186 output_file_ext: str = ".hdr", 2187 clean_header: bool = True, 2188 remove_chrom_line: bool = False, 2189 ) -> str: 2190 """ 2191 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2192 specified options, and writes it to a new file. 2193 2194 :param header_name: The `header_name` parameter is the name of the header file to be created. 
If 2195 this parameter is not specified, the header will be written to the output file 2196 :type header_name: str 2197 :param output_file: The `output_file` parameter in the `export_header` function is used to 2198 specify the name of the output file where the header will be written. If this parameter is not 2199 provided, the header will be written to a temporary file 2200 :type output_file: str 2201 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2202 string that represents the extension of the output header file. By default, it is set to ".hdr" 2203 if not specified by the user. This extension will be appended to the `output_file` name to 2204 create the final, defaults to .hdr 2205 :type output_file_ext: str (optional) 2206 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2207 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2208 `True`, the function will clean the header by modifying certain lines based on a specific 2209 pattern. If `clean_header`, defaults to True 2210 :type clean_header: bool (optional) 2211 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2212 boolean flag that determines whether the #CHROM line should be removed from the header before 2213 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2214 defaults to False 2215 :type remove_chrom_line: bool (optional) 2216 :return: The function `export_header` returns the name of the temporary header file that is 2217 created. 
2218 """ 2219 2220 if not header_name and not output_file: 2221 output_file = self.get_output() 2222 2223 if self.get_header(): 2224 2225 # Get header object 2226 header_obj = self.get_header() 2227 2228 # Create database 2229 db_for_header = Database(database=self.get_input()) 2230 2231 # Get real columns in the file 2232 db_header_columns = db_for_header.get_columns() 2233 2234 with tempfile.TemporaryDirectory() as tmpdir: 2235 2236 # Write header file 2237 header_file_tmp = os.path.join(tmpdir, "header") 2238 f = open(header_file_tmp, "w") 2239 vcf.Writer(f, header_obj) 2240 f.close() 2241 2242 # Replace #CHROM line with rel columns 2243 header_list = db_for_header.read_header_file( 2244 header_file=header_file_tmp 2245 ) 2246 header_list[-1] = "\t".join(db_header_columns) 2247 2248 # Remove CHROM line 2249 if remove_chrom_line: 2250 header_list.pop() 2251 2252 # Clean header 2253 if clean_header: 2254 header_list_clean = [] 2255 for head in header_list: 2256 # Clean head for malformed header 2257 head_clean = head 2258 head_clean = re.subn( 2259 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2260 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2261 head_clean, 2262 2, 2263 )[0] 2264 # Write header 2265 header_list_clean.append(head_clean) 2266 header_list = header_list_clean 2267 2268 tmp_header_name = output_file + output_file_ext 2269 2270 f = open(tmp_header_name, "w") 2271 for line in header_list: 2272 f.write(line) 2273 f.close() 2274 2275 return tmp_header_name 2276 2277 def export_variant_vcf( 2278 self, 2279 vcf_file, 2280 remove_info: bool = False, 2281 add_samples: bool = True, 2282 list_samples: list = [], 2283 where_clause: str = "", 2284 index: bool = False, 2285 threads: int | None = None, 2286 ) -> bool | None: 2287 """ 2288 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2289 remove INFO field, add samples, and control compression and indexing. 

        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
        written to. It is the output file that will contain the filtered VCF data based on the specified
        parameters
        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included,
        defaults to False
        :type remove_info: bool (optional)
        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
        the samples should be added to the VCF file or not. If set to True, the samples will be added.
        If set to False, the samples will be removed. The default value is True, defaults to True
        :type add_samples: bool (optional)
        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
        in the output VCF file. By default, all samples will be included. If you provide a list of
        samples, only those samples will be included in the output file.
        NOTE(review): mutable default argument (`[]`) — harmless here because it is
        only read, but prefer `None` + `[]`-fallback to avoid the shared-default trap.
        :type list_samples: list
        :param where_clause: optional SQL WHERE clause appended verbatim to the SELECT.
        NOTE(review): interpolated directly into the query string — callers must pass
        trusted SQL only.
        :type where_clause: str
        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
        determines whether or not to create an index for the output VCF file. If `index` is set to
        `True`, the output VCF file will be indexed using tabix, defaults to False
        :type index: bool (optional)
        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
        number of threads to use for exporting the VCF file. If None, falls back to the
        configured thread count via `get_threads()`.
        :type threads: int | None
        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
        method with various parameters including the output file, query, threads, sort flag, and index
        flag. The `export_output` method is responsible for exporting the VCF data based on the
        specified parameters and configurations provided in the `export_variant_vcf` function.
        """

        # Config
        config = self.get_config()

        # Extract VCF
        log.debug("Export VCF...")

        # Table variants
        table_variants = self.get_table_variants()

        # Threads
        if not threads:
            threads = self.get_threads()

        # Info fields
        # remove_info may also be a string: it is then used as the literal INFO
        # replacement value instead of "."
        if remove_info:
            if not isinstance(remove_info, str):
                remove_info = "."
            info_field = f"""'{remove_info}' as INFO"""
        else:
            info_field = "INFO"

        # Samples fields
        if add_samples:
            if not list_samples:
                list_samples = self.get_header_sample_list()
            if list_samples:
                samples_fields = " , FORMAT , " + " , ".join(list_samples)
            else:
                samples_fields = ""
            log.debug(f"samples_fields: {samples_fields}")
        else:
            samples_fields = ""

        # Where clause
        if where_clause is None:
            where_clause = ""

        # Variants
        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
        log.debug(f"sql_query_select={sql_query_select}")

        return self.export_output(
            output_file=vcf_file,
            output_header=None,
            export_header=True,
            query=sql_query_select,
            parquet_partitions=None,
            chunk_size=config.get("chunk_size", None),
            threads=threads,
            sort=True,
            index=index,
            order_by=None,
        )

    def run_commands(self, commands: list = [], threads: int = 1) -> None:
        """
        It takes a list of commands and runs
        them in parallel using the number of threads specified.

        :param commands: A list of commands to run
        :param threads: The number of threads to use, defaults to 1 (optional)
        """

        run_parallel_commands(commands, threads)

    def get_threads(self, default: int = 1) -> int:
        """
        This function returns the number of threads to use for a job, with a default value of 1 if not
        specified.

        :param default: The `default` parameter in the `get_threads` method is used to specify the
        default number of threads to use if no specific value is provided. If no value is provided for
        the `threads` parameter in the configuration or input parameters, the `default` value will be
        used, defaults to 1
        :type default: int (optional)
        :return: the number of threads to use for the current job.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input threads (param takes precedence over config)
        input_thread = param.get("threads", config.get("threads", None))

        # Check threads: falsy -> default; <=0 -> all available CPUs; else as given
        if not input_thread:
            threads = default
        elif int(input_thread) <= 0:
            threads = os.cpu_count()
        else:
            threads = int(input_thread)
        return threads

    def get_memory(self, default: str = None) -> str:
        """
        This function retrieves the memory value from parameters or configuration with a default value
        if not found.

        :param default: The `get_memory` function takes in a default value as a string parameter. This
        default value is used as a fallback in case the `memory` parameter is not provided in the
        `param` dictionary or the `config` dictionary.
        :type default: str
        :return: The `get_memory` function returns a string value representing the memory parameter. If
        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
        return the default value provided as an argument to the function.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory (param takes precedence over config)
        input_memory = param.get("memory", config.get("memory", None))

        # Check memory
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        If the database is duckdb, then use the parquet method, otherwise use the sqlite method.
        NOTE(review): silently does nothing for any other connexion format — TODO
        confirm whether an error/log would be more appropriate.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        It takes a VCF file and updates the INFO column of the variants table in the database with the
        INFO column of the VCF file.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Loading VCF into a temporary DataFrame
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # The correlated subquery below reads `vcf_df` by name: DuckDB resolves
        # local pandas DataFrames as tables (replacement scan).
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS
 VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
        table, then updates the INFO column of the variants table with the INFO column of the temporary
        table.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table for the VCF (same schema as variants, no rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Loading VCF into temporary table
        # NOTE(review): exactly 8 column names are assigned — this raises if the
        # VCF carries FORMAT/sample columns; TODO confirm callers only pass
        # site-only (8-column) VCFs here.
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator
        # NOTE(review): unlike the duckdb variant, the WHERE lacks
        # "AND table_vcf.INFO NOT IN ('','.')"; in SQLite an empty subquery yields
        # NULL and `x || NULL` is NULL, which would null out INFO for unmatched
        # rows — TODO confirm every variant always has a match.
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        Drop the variants table (no-op if it does not exist).
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        It adds a column to the variants table called `variant_id` and populates it with a hash of the
        `#CHROM`, `POS`, `REF`, and `ALT` columns.

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the hardcoded name "variant_id"
        # instead of variant_id_column — a custom column name is re-created on
        # every call; TODO confirm and use variant_id_column here.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): '"{prefix}SVTYPE"' (single-quoted) is a SQL string
            # LITERAL, not a column reference — every row hashes the same constant
            # text rather than its SVTYPE value; likely intended "{prefix}SVTYPE"
            # (double-quoted identifier) — TODO confirm.
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT",
 '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns (the temporary exploded SVTYPE column)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        This function returns the variant_id column name, creating/refreshing the
        column as a side effect (delegates to `set_variant_id`).

        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
        defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, will force the variant_id column to be (re)created even if it already
        exists
        :type force: bool
        :return: The variant_id column name.
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        The function `scan_databases` scans for available databases based on specified formats and
        releases.

        :param database_formats: The `database_formats` parameter is a list that specifies the formats
        of the databases to be scanned. In this case, the accepted format is "parquet"
        :type database_formats: list ["parquet"]
        :param database_releases: The `database_releases` parameter is a list that specifies the
        releases of the databases to be scanned. In the provided function, the default value for
        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
        databases that are in the "current" release
        :type database_releases: list
        :return: The function `scan_databases` returns a dictionary containing information about
        databases that match the specified formats and releases.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.
2720 """ 2721 2722 # Config 2723 config = self.get_config() 2724 2725 # Param 2726 param = self.get_param() 2727 2728 # Param - Assembly 2729 assembly = param.get("assembly", config.get("assembly", None)) 2730 if not assembly: 2731 assembly = DEFAULT_ASSEMBLY 2732 log.warning(f"Default assembly '{assembly}'") 2733 2734 # annotations databases folders 2735 annotations_databases = set( 2736 config.get("folders", {}) 2737 .get("databases", {}) 2738 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2739 + config.get("folders", {}) 2740 .get("databases", {}) 2741 .get("parquet", ["~/howard/databases/parquet/current"]) 2742 + config.get("folders", {}) 2743 .get("databases", {}) 2744 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2745 ) 2746 2747 # Get param annotations 2748 if param.get("annotations", None) and isinstance( 2749 param.get("annotations", None), str 2750 ): 2751 log.debug(param.get("annotations", None)) 2752 param_annotation_list = param.get("annotations").split(",") 2753 else: 2754 param_annotation_list = [] 2755 2756 # Each tools param 2757 if param.get("annotation_parquet", None) != None: 2758 log.debug( 2759 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2760 ) 2761 if isinstance(param.get("annotation_parquet", None), list): 2762 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2763 else: 2764 param_annotation_list.append(param.get("annotation_parquet")) 2765 if param.get("annotation_snpsift", None) != None: 2766 if isinstance(param.get("annotation_snpsift", None), list): 2767 param_annotation_list.append( 2768 "snpsift:" 2769 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2770 ) 2771 else: 2772 param_annotation_list.append( 2773 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2774 ) 2775 if param.get("annotation_snpeff", None) != None: 2776 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2777 if param.get("annotation_bcftools", 
None) != None: 2778 if isinstance(param.get("annotation_bcftools", None), list): 2779 param_annotation_list.append( 2780 "bcftools:" 2781 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2782 ) 2783 else: 2784 param_annotation_list.append( 2785 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2786 ) 2787 if param.get("annotation_annovar", None) != None: 2788 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2789 if param.get("annotation_exomiser", None) != None: 2790 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2791 if param.get("annotation_splice", None) != None: 2792 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2793 2794 # Merge param annotations list 2795 param["annotations"] = ",".join(param_annotation_list) 2796 2797 # debug 2798 log.debug(f"param_annotations={param['annotations']}") 2799 2800 if param.get("annotations"): 2801 2802 # Log 2803 # log.info("Annotations - Check annotation parameters") 2804 2805 if not "annotation" in param: 2806 param["annotation"] = {} 2807 2808 # List of annotations parameters 2809 annotations_list_input = {} 2810 if isinstance(param.get("annotations", None), str): 2811 annotation_file_list = [ 2812 value for value in param.get("annotations", "").split(",") 2813 ] 2814 for annotation_file in annotation_file_list: 2815 annotations_list_input[annotation_file] = {"INFO": None} 2816 else: 2817 annotations_list_input = param.get("annotations", {}) 2818 2819 log.info(f"Quick Annotations:") 2820 for annotation_key in list(annotations_list_input.keys()): 2821 log.info(f" {annotation_key}") 2822 2823 # List of annotations and associated fields 2824 annotations_list = {} 2825 2826 for annotation_file in annotations_list_input: 2827 2828 # Explode annotations if ALL 2829 if ( 2830 annotation_file.upper() == "ALL" 2831 or annotation_file.upper().startswith("ALL:") 2832 ): 2833 2834 # check ALL parameters (formats, releases) 
2835 annotation_file_split = annotation_file.split(":") 2836 database_formats = "parquet" 2837 database_releases = "current" 2838 for annotation_file_option in annotation_file_split[1:]: 2839 database_all_options_split = annotation_file_option.split("=") 2840 if database_all_options_split[0] == "format": 2841 database_formats = database_all_options_split[1].split("+") 2842 if database_all_options_split[0] == "release": 2843 database_releases = database_all_options_split[1].split("+") 2844 2845 # Scan for availabled databases 2846 databases_infos_dict = self.scan_databases( 2847 database_formats=database_formats, 2848 database_releases=database_releases, 2849 ) 2850 2851 # Add found databases in annotation parameters 2852 for database_infos in databases_infos_dict.keys(): 2853 annotations_list[database_infos] = {"INFO": None} 2854 2855 else: 2856 annotations_list[annotation_file] = annotations_list_input[ 2857 annotation_file 2858 ] 2859 2860 # Check each databases 2861 if len(annotations_list): 2862 2863 log.info( 2864 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
2865 ) 2866 2867 for annotation_file in annotations_list: 2868 2869 # Init 2870 annotations = annotations_list.get(annotation_file, None) 2871 2872 # Annotation snpEff 2873 if annotation_file.startswith("snpeff"): 2874 2875 log.debug(f"Quick Annotation snpEff") 2876 2877 if "snpeff" not in param["annotation"]: 2878 param["annotation"]["snpeff"] = {} 2879 2880 if "options" not in param["annotation"]["snpeff"]: 2881 param["annotation"]["snpeff"]["options"] = "" 2882 2883 # snpEff options in annotations 2884 param["annotation"]["snpeff"]["options"] = "".join( 2885 annotation_file.split(":")[1:] 2886 ) 2887 2888 # Annotation Annovar 2889 elif annotation_file.startswith("annovar"): 2890 2891 log.debug(f"Quick Annotation Annovar") 2892 2893 if "annovar" not in param["annotation"]: 2894 param["annotation"]["annovar"] = {} 2895 2896 if "annotations" not in param["annotation"]["annovar"]: 2897 param["annotation"]["annovar"]["annotations"] = {} 2898 2899 # Options 2900 annotation_file_split = annotation_file.split(":") 2901 for annotation_file_annotation in annotation_file_split[1:]: 2902 if annotation_file_annotation: 2903 param["annotation"]["annovar"]["annotations"][ 2904 annotation_file_annotation 2905 ] = annotations 2906 2907 # Annotation Exomiser 2908 elif annotation_file.startswith("exomiser"): 2909 2910 log.debug(f"Quick Annotation Exomiser") 2911 2912 param["annotation"]["exomiser"] = params_string_to_dict( 2913 annotation_file 2914 ) 2915 2916 # Annotation Splice 2917 elif annotation_file.startswith("splice"): 2918 2919 log.debug(f"Quick Annotation Splice") 2920 2921 param["annotation"]["splice"] = params_string_to_dict( 2922 annotation_file 2923 ) 2924 2925 # Annotation Parquet or BCFTOOLS 2926 else: 2927 2928 # Tools detection 2929 if annotation_file.startswith("bcftools:"): 2930 annotation_tool_initial = "bcftools" 2931 annotation_file = ":".join(annotation_file.split(":")[1:]) 2932 elif annotation_file.startswith("snpsift:"): 2933 annotation_tool_initial = 
"snpsift" 2934 annotation_file = ":".join(annotation_file.split(":")[1:]) 2935 else: 2936 annotation_tool_initial = None 2937 2938 # list of files 2939 annotation_file_list = annotation_file.replace("+", ":").split( 2940 ":" 2941 ) 2942 2943 for annotation_file in annotation_file_list: 2944 2945 if annotation_file: 2946 2947 # Annotation tool initial 2948 annotation_tool = annotation_tool_initial 2949 2950 # Find file 2951 annotation_file_found = None 2952 2953 # Expand user 2954 annotation_file = full_path(annotation_file) 2955 2956 if os.path.exists(annotation_file): 2957 annotation_file_found = annotation_file 2958 2959 else: 2960 # Find within assembly folders 2961 for annotations_database in annotations_databases: 2962 found_files = find_all( 2963 annotation_file, 2964 os.path.join( 2965 annotations_database, assembly 2966 ), 2967 ) 2968 if len(found_files) > 0: 2969 annotation_file_found = found_files[0] 2970 break 2971 if not annotation_file_found and not assembly: 2972 # Find within folders 2973 for ( 2974 annotations_database 2975 ) in annotations_databases: 2976 found_files = find_all( 2977 annotation_file, annotations_database 2978 ) 2979 if len(found_files) > 0: 2980 annotation_file_found = found_files[0] 2981 break 2982 log.debug( 2983 f"for {annotation_file} annotation_file_found={annotation_file_found}" 2984 ) 2985 2986 # Full path 2987 annotation_file_found = full_path(annotation_file_found) 2988 2989 if annotation_file_found: 2990 2991 database = Database(database=annotation_file_found) 2992 quick_annotation_format = database.get_format() 2993 quick_annotation_is_compressed = ( 2994 database.is_compressed() 2995 ) 2996 quick_annotation_is_indexed = os.path.exists( 2997 f"{annotation_file_found}.tbi" 2998 ) 2999 bcftools_preference = False 3000 3001 # Check Annotation Tool 3002 if not annotation_tool: 3003 if ( 3004 bcftools_preference 3005 and quick_annotation_format 3006 in ["vcf", "bed"] 3007 and quick_annotation_is_compressed 3008 and 
quick_annotation_is_indexed 3009 ): 3010 annotation_tool = "bcftools" 3011 elif quick_annotation_format in [ 3012 "vcf", 3013 "bed", 3014 "tsv", 3015 "tsv", 3016 "csv", 3017 "json", 3018 "tbl", 3019 "parquet", 3020 "duckdb", 3021 ]: 3022 annotation_tool = "parquet" 3023 else: 3024 log.error( 3025 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3026 ) 3027 raise ValueError( 3028 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3029 ) 3030 3031 log.debug( 3032 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3033 ) 3034 3035 # Annotation Tool dispatch 3036 if annotation_tool: 3037 if annotation_tool not in param["annotation"]: 3038 param["annotation"][annotation_tool] = {} 3039 if ( 3040 "annotations" 3041 not in param["annotation"][annotation_tool] 3042 ): 3043 param["annotation"][annotation_tool][ 3044 "annotations" 3045 ] = {} 3046 param["annotation"][annotation_tool][ 3047 "annotations" 3048 ][annotation_file_found] = annotations 3049 3050 else: 3051 log.error( 3052 f"Quick Annotation File {annotation_file} does NOT exist" 3053 ) 3054 3055 self.set_param(param) 3056 3057 if param.get("annotation", None): 3058 log.info("Annotations") 3059 if param.get("annotation", {}).get("parquet", None): 3060 log.info("Annotations 'parquet'...") 3061 self.annotation_parquet() 3062 if param.get("annotation", {}).get("bcftools", None): 3063 log.info("Annotations 'bcftools'...") 3064 self.annotation_bcftools() 3065 if param.get("annotation", {}).get("snpsift", None): 3066 log.info("Annotations 'snpsift'...") 3067 self.annotation_snpsift() 3068 if param.get("annotation", {}).get("annovar", None): 3069 log.info("Annotations 'annovar'...") 3070 self.annotation_annovar() 3071 if param.get("annotation", {}).get("snpeff", None): 3072 log.info("Annotations 'snpeff'...") 3073 self.annotation_snpeff() 3074 if param.get("annotation", {}).get("exomiser", 
None) is not None: 3075 log.info("Annotations 'exomiser'...") 3076 self.annotation_exomiser() 3077 if param.get("annotation", {}).get("splice", None) is not None: 3078 log.info("Annotations 'splice' ...") 3079 self.annotation_splice() 3080 3081 # Explode INFOS fields into table fields 3082 if self.get_explode_infos(): 3083 self.explode_infos( 3084 prefix=self.get_explode_infos_prefix(), 3085 fields=self.get_explode_infos_fields(), 3086 force=True, 3087 ) 3088 3089 def annotation_snpsift(self, threads: int = None) -> None: 3090 """ 3091 This function annotate with bcftools 3092 3093 :param threads: Number of threads to use 3094 :return: the value of the variable "return_value". 3095 """ 3096 3097 # DEBUG 3098 log.debug("Start annotation with bcftools databases") 3099 3100 # Threads 3101 if not threads: 3102 threads = self.get_threads() 3103 log.debug("Threads: " + str(threads)) 3104 3105 # Config 3106 config = self.get_config() 3107 log.debug("Config: " + str(config)) 3108 3109 # Config - snpSift 3110 snpsift_bin_command = get_bin_command( 3111 bin="SnpSift.jar", 3112 tool="snpsift", 3113 bin_type="jar", 3114 config=config, 3115 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3116 ) 3117 if not snpsift_bin_command: 3118 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3119 log.error(msg_err) 3120 raise ValueError(msg_err) 3121 3122 # Config - bcftools 3123 bcftools_bin_command = get_bin_command( 3124 bin="bcftools", 3125 tool="bcftools", 3126 bin_type="bin", 3127 config=config, 3128 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3129 ) 3130 if not bcftools_bin_command: 3131 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3132 log.error(msg_err) 3133 raise ValueError(msg_err) 3134 3135 # Config - BCFTools databases folders 3136 databases_folders = set( 3137 self.get_config() 3138 .get("folders", {}) 3139 .get("databases", {}) 3140 .get("annotations", ["."]) 3141 + self.get_config() 3142 .get("folders", {}) 3143 
.get("databases", {}) 3144 .get("bcftools", ["."]) 3145 ) 3146 log.debug("Databases annotations: " + str(databases_folders)) 3147 3148 # Param 3149 annotations = ( 3150 self.get_param() 3151 .get("annotation", {}) 3152 .get("snpsift", {}) 3153 .get("annotations", None) 3154 ) 3155 log.debug("Annotations: " + str(annotations)) 3156 3157 # Assembly 3158 assembly = self.get_param().get( 3159 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3160 ) 3161 3162 # Data 3163 table_variants = self.get_table_variants() 3164 3165 # Check if not empty 3166 log.debug("Check if not empty") 3167 sql_query_chromosomes = ( 3168 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3169 ) 3170 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3171 if not sql_query_chromosomes_df["count"][0]: 3172 log.info(f"VCF empty") 3173 return 3174 3175 # VCF header 3176 vcf_reader = self.get_header() 3177 log.debug("Initial header: " + str(vcf_reader.infos)) 3178 3179 # Existing annotations 3180 for vcf_annotation in self.get_header().infos: 3181 3182 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3183 log.debug( 3184 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3185 ) 3186 3187 if annotations: 3188 3189 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3190 3191 # Export VCF file 3192 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3193 3194 # Init 3195 commands = {} 3196 3197 for annotation in annotations: 3198 annotation_fields = annotations[annotation] 3199 3200 # Annotation Name 3201 annotation_name = os.path.basename(annotation) 3202 3203 if not annotation_fields: 3204 annotation_fields = {"INFO": None} 3205 3206 log.debug(f"Annotation '{annotation_name}'") 3207 log.debug( 3208 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3209 ) 3210 3211 # Create Database 3212 database = Database( 3213 database=annotation, 3214 databases_folders=databases_folders, 3215 
assembly=assembly, 3216 ) 3217 3218 # Find files 3219 db_file = database.get_database() 3220 db_file = full_path(db_file) 3221 db_hdr_file = database.get_header_file() 3222 db_hdr_file = full_path(db_hdr_file) 3223 db_file_type = database.get_format() 3224 db_tbi_file = f"{db_file}.tbi" 3225 db_file_compressed = database.is_compressed() 3226 3227 # Check if compressed 3228 if not db_file_compressed: 3229 log.error( 3230 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3231 ) 3232 raise ValueError( 3233 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3234 ) 3235 3236 # Check if indexed 3237 if not os.path.exists(db_tbi_file): 3238 log.error( 3239 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3240 ) 3241 raise ValueError( 3242 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3243 ) 3244 3245 # Check index - try to create if not exists 3246 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3247 log.error("Annotation failed: database not valid") 3248 log.error(f"Annotation annotation file: {db_file}") 3249 log.error(f"Annotation annotation header: {db_hdr_file}") 3250 log.error(f"Annotation annotation index: {db_tbi_file}") 3251 raise ValueError( 3252 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3253 ) 3254 else: 3255 3256 log.debug( 3257 f"Annotation '{annotation}' - file: " 3258 + str(db_file) 3259 + " and " 3260 + str(db_hdr_file) 3261 ) 3262 3263 # Load header as VCF object 3264 db_hdr_vcf = Variants(input=db_hdr_file) 3265 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3266 log.debug( 3267 "Annotation database header: " 3268 + str(db_hdr_vcf_header_infos) 3269 ) 3270 3271 # For all fields in database 3272 annotation_fields_full = False 3273 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3274 annotation_fields = { 3275 key: key for key in 
db_hdr_vcf_header_infos 3276 } 3277 log.debug( 3278 "Annotation database header - All annotations added: " 3279 + str(annotation_fields) 3280 ) 3281 annotation_fields_full = True 3282 3283 # # Create file for field rename 3284 # log.debug("Create file for field rename") 3285 # tmp_rename = NamedTemporaryFile( 3286 # prefix=self.get_prefix(), 3287 # dir=self.get_tmp_dir(), 3288 # suffix=".rename", 3289 # delete=False, 3290 # ) 3291 # tmp_rename_name = tmp_rename.name 3292 # tmp_files.append(tmp_rename_name) 3293 3294 # Number of fields 3295 nb_annotation_field = 0 3296 annotation_list = [] 3297 annotation_infos_rename_list = [] 3298 3299 for annotation_field in annotation_fields: 3300 3301 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3302 annotation_fields_new_name = annotation_fields.get( 3303 annotation_field, annotation_field 3304 ) 3305 if not annotation_fields_new_name: 3306 annotation_fields_new_name = annotation_field 3307 3308 # Check if field is in DB and if field is not elready in input data 3309 if ( 3310 annotation_field in db_hdr_vcf.get_header().infos 3311 and annotation_fields_new_name 3312 not in self.get_header().infos 3313 ): 3314 3315 log.info( 3316 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3317 ) 3318 3319 # BCFTools annotate param to rename fields 3320 if annotation_field != annotation_fields_new_name: 3321 annotation_infos_rename_list.append( 3322 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3323 ) 3324 3325 # Add INFO field to header 3326 db_hdr_vcf_header_infos_number = ( 3327 db_hdr_vcf_header_infos[annotation_field].num or "." 
3328 ) 3329 db_hdr_vcf_header_infos_type = ( 3330 db_hdr_vcf_header_infos[annotation_field].type 3331 or "String" 3332 ) 3333 db_hdr_vcf_header_infos_description = ( 3334 db_hdr_vcf_header_infos[annotation_field].desc 3335 or f"{annotation_field} description" 3336 ) 3337 db_hdr_vcf_header_infos_source = ( 3338 db_hdr_vcf_header_infos[annotation_field].source 3339 or "unknown" 3340 ) 3341 db_hdr_vcf_header_infos_version = ( 3342 db_hdr_vcf_header_infos[annotation_field].version 3343 or "unknown" 3344 ) 3345 3346 vcf_reader.infos[annotation_fields_new_name] = ( 3347 vcf.parser._Info( 3348 annotation_fields_new_name, 3349 db_hdr_vcf_header_infos_number, 3350 db_hdr_vcf_header_infos_type, 3351 db_hdr_vcf_header_infos_description, 3352 db_hdr_vcf_header_infos_source, 3353 db_hdr_vcf_header_infos_version, 3354 self.code_type_map[ 3355 db_hdr_vcf_header_infos_type 3356 ], 3357 ) 3358 ) 3359 3360 annotation_list.append(annotation_field) 3361 3362 nb_annotation_field += 1 3363 3364 else: 3365 3366 if ( 3367 annotation_field 3368 not in db_hdr_vcf.get_header().infos 3369 ): 3370 log.warning( 3371 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3372 ) 3373 if ( 3374 annotation_fields_new_name 3375 in self.get_header().infos 3376 ): 3377 log.warning( 3378 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3379 ) 3380 3381 log.info( 3382 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3383 ) 3384 3385 annotation_infos = ",".join(annotation_list) 3386 3387 if annotation_infos != "": 3388 3389 # Annotated VCF (and error file) 3390 tmp_annotation_vcf_name = os.path.join( 3391 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3392 ) 3393 tmp_annotation_vcf_name_err = ( 3394 tmp_annotation_vcf_name + ".err" 3395 ) 3396 3397 # Add fields to annotate 3398 if not annotation_fields_full: 3399 annotation_infos_option = f"-info {annotation_infos}" 3400 else: 
3401 annotation_infos_option = "" 3402 3403 # Info fields rename 3404 if annotation_infos_rename_list: 3405 annotation_infos_rename = " -c " + ",".join( 3406 annotation_infos_rename_list 3407 ) 3408 else: 3409 annotation_infos_rename = "" 3410 3411 # Annotate command 3412 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3413 3414 # Add command 3415 commands[command_annotate] = tmp_annotation_vcf_name 3416 3417 if commands: 3418 3419 # Export VCF file 3420 self.export_variant_vcf( 3421 vcf_file=tmp_vcf_name, 3422 remove_info=True, 3423 add_samples=False, 3424 index=True, 3425 ) 3426 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3427 3428 # Num command 3429 nb_command = 0 3430 3431 # Annotate 3432 for command_annotate in commands: 3433 nb_command += 1 3434 log.info( 3435 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3436 ) 3437 log.debug(f"command_annotate={command_annotate}") 3438 run_parallel_commands([command_annotate], threads) 3439 3440 # Debug 3441 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3442 3443 # Update variants 3444 log.info( 3445 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3446 ) 3447 self.update_from_vcf(commands[command_annotate]) 3448 3449 def annotation_bcftools(self, threads: int = None) -> None: 3450 """ 3451 This function annotate with bcftools 3452 3453 :param threads: Number of threads to use 3454 :return: the value of the variable "return_value". 
3455 """ 3456 3457 # DEBUG 3458 log.debug("Start annotation with bcftools databases") 3459 3460 # Threads 3461 if not threads: 3462 threads = self.get_threads() 3463 log.debug("Threads: " + str(threads)) 3464 3465 # Config 3466 config = self.get_config() 3467 log.debug("Config: " + str(config)) 3468 3469 # DEBUG 3470 delete_tmp = True 3471 if self.get_config().get("verbosity", "warning") in ["debug"]: 3472 delete_tmp = False 3473 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3474 3475 # Config - BCFTools bin command 3476 bcftools_bin_command = get_bin_command( 3477 bin="bcftools", 3478 tool="bcftools", 3479 bin_type="bin", 3480 config=config, 3481 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3482 ) 3483 if not bcftools_bin_command: 3484 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3485 log.error(msg_err) 3486 raise ValueError(msg_err) 3487 3488 # Config - BCFTools databases folders 3489 databases_folders = set( 3490 self.get_config() 3491 .get("folders", {}) 3492 .get("databases", {}) 3493 .get("annotations", ["."]) 3494 + self.get_config() 3495 .get("folders", {}) 3496 .get("databases", {}) 3497 .get("bcftools", ["."]) 3498 ) 3499 log.debug("Databases annotations: " + str(databases_folders)) 3500 3501 # Param 3502 annotations = ( 3503 self.get_param() 3504 .get("annotation", {}) 3505 .get("bcftools", {}) 3506 .get("annotations", None) 3507 ) 3508 log.debug("Annotations: " + str(annotations)) 3509 3510 # Assembly 3511 assembly = self.get_param().get( 3512 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3513 ) 3514 3515 # Data 3516 table_variants = self.get_table_variants() 3517 3518 # Check if not empty 3519 log.debug("Check if not empty") 3520 sql_query_chromosomes = ( 3521 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3522 ) 3523 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3524 if not sql_query_chromosomes_df["count"][0]: 3525 log.info(f"VCF empty") 
3526 return 3527 3528 # Export in VCF 3529 log.debug("Create initial file to annotate") 3530 tmp_vcf = NamedTemporaryFile( 3531 prefix=self.get_prefix(), 3532 dir=self.get_tmp_dir(), 3533 suffix=".vcf.gz", 3534 delete=False, 3535 ) 3536 tmp_vcf_name = tmp_vcf.name 3537 3538 # VCF header 3539 vcf_reader = self.get_header() 3540 log.debug("Initial header: " + str(vcf_reader.infos)) 3541 3542 # Existing annotations 3543 for vcf_annotation in self.get_header().infos: 3544 3545 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3546 log.debug( 3547 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3548 ) 3549 3550 if annotations: 3551 3552 tmp_ann_vcf_list = [] 3553 commands = [] 3554 tmp_files = [] 3555 err_files = [] 3556 3557 for annotation in annotations: 3558 annotation_fields = annotations[annotation] 3559 3560 # Annotation Name 3561 annotation_name = os.path.basename(annotation) 3562 3563 if not annotation_fields: 3564 annotation_fields = {"INFO": None} 3565 3566 log.debug(f"Annotation '{annotation_name}'") 3567 log.debug( 3568 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3569 ) 3570 3571 # Create Database 3572 database = Database( 3573 database=annotation, 3574 databases_folders=databases_folders, 3575 assembly=assembly, 3576 ) 3577 3578 # Find files 3579 db_file = database.get_database() 3580 db_file = full_path(db_file) 3581 db_hdr_file = database.get_header_file() 3582 db_hdr_file = full_path(db_hdr_file) 3583 db_file_type = database.get_format() 3584 db_tbi_file = f"{db_file}.tbi" 3585 db_file_compressed = database.is_compressed() 3586 3587 # Check if compressed 3588 if not db_file_compressed: 3589 log.error( 3590 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3591 ) 3592 raise ValueError( 3593 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3594 ) 3595 3596 # Check if indexed 3597 if not os.path.exists(db_tbi_file): 3598 log.error(f"Annotation '{annotation}' - {db_file} NOT 
indexed file") 3599 raise ValueError( 3600 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3601 ) 3602 3603 # Check index - try to create if not exists 3604 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3605 log.error("Annotation failed: database not valid") 3606 log.error(f"Annotation annotation file: {db_file}") 3607 log.error(f"Annotation annotation header: {db_hdr_file}") 3608 log.error(f"Annotation annotation index: {db_tbi_file}") 3609 raise ValueError( 3610 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3611 ) 3612 else: 3613 3614 log.debug( 3615 f"Annotation '{annotation}' - file: " 3616 + str(db_file) 3617 + " and " 3618 + str(db_hdr_file) 3619 ) 3620 3621 # Load header as VCF object 3622 db_hdr_vcf = Variants(input=db_hdr_file) 3623 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3624 log.debug( 3625 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3626 ) 3627 3628 # For all fields in database 3629 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3630 annotation_fields = { 3631 key: key for key in db_hdr_vcf_header_infos 3632 } 3633 log.debug( 3634 "Annotation database header - All annotations added: " 3635 + str(annotation_fields) 3636 ) 3637 3638 # Number of fields 3639 nb_annotation_field = 0 3640 annotation_list = [] 3641 3642 for annotation_field in annotation_fields: 3643 3644 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3645 annotation_fields_new_name = annotation_fields.get( 3646 annotation_field, annotation_field 3647 ) 3648 if not annotation_fields_new_name: 3649 annotation_fields_new_name = annotation_field 3650 3651 # Check if field is in DB and if field is not elready in input data 3652 if ( 3653 annotation_field in db_hdr_vcf.get_header().infos 3654 and annotation_fields_new_name 3655 not in self.get_header().infos 3656 ): 3657 3658 log.info( 3659 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3660 ) 3661 3662 # Add INFO field to header 3663 db_hdr_vcf_header_infos_number = ( 3664 db_hdr_vcf_header_infos[annotation_field].num or "." 3665 ) 3666 db_hdr_vcf_header_infos_type = ( 3667 db_hdr_vcf_header_infos[annotation_field].type 3668 or "String" 3669 ) 3670 db_hdr_vcf_header_infos_description = ( 3671 db_hdr_vcf_header_infos[annotation_field].desc 3672 or f"{annotation_field} description" 3673 ) 3674 db_hdr_vcf_header_infos_source = ( 3675 db_hdr_vcf_header_infos[annotation_field].source 3676 or "unknown" 3677 ) 3678 db_hdr_vcf_header_infos_version = ( 3679 db_hdr_vcf_header_infos[annotation_field].version 3680 or "unknown" 3681 ) 3682 3683 vcf_reader.infos[annotation_fields_new_name] = ( 3684 vcf.parser._Info( 3685 annotation_fields_new_name, 3686 db_hdr_vcf_header_infos_number, 3687 db_hdr_vcf_header_infos_type, 3688 db_hdr_vcf_header_infos_description, 3689 db_hdr_vcf_header_infos_source, 3690 db_hdr_vcf_header_infos_version, 3691 self.code_type_map[db_hdr_vcf_header_infos_type], 3692 ) 3693 ) 3694 3695 # annotation_list.append(annotation_field) 3696 if annotation_field != annotation_fields_new_name: 3697 annotation_list.append( 3698 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3699 ) 3700 else: 3701 annotation_list.append(annotation_field) 3702 3703 nb_annotation_field += 1 3704 3705 else: 3706 3707 if annotation_field not in db_hdr_vcf.get_header().infos: 3708 log.warning( 3709 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3710 ) 3711 if annotation_fields_new_name in self.get_header().infos: 3712 log.warning( 3713 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3714 ) 3715 3716 log.info( 3717 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3718 ) 3719 3720 annotation_infos = ",".join(annotation_list) 3721 3722 if annotation_infos != "": 3723 3724 # Protect header for bcftools (remove "#CHROM" and variants line) 3725 log.debug("Protect Header file - remove #CHROM line if exists") 3726 tmp_header_vcf = NamedTemporaryFile( 3727 prefix=self.get_prefix(), 3728 dir=self.get_tmp_dir(), 3729 suffix=".hdr", 3730 delete=False, 3731 ) 3732 tmp_header_vcf_name = tmp_header_vcf.name 3733 tmp_files.append(tmp_header_vcf_name) 3734 # Command 3735 if db_hdr_file.endswith(".gz"): 3736 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3737 else: 3738 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3739 # Run 3740 run_parallel_commands([command_extract_header], 1) 3741 3742 # Find chomosomes 3743 log.debug("Find chromosomes ") 3744 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3745 sql_query_chromosomes_df = self.get_query_to_df( 3746 sql_query_chromosomes 3747 ) 3748 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3749 3750 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3751 3752 # BED columns in the annotation file 3753 if db_file_type in ["bed"]: 3754 annotation_infos = "CHROM,POS,POS," + annotation_infos 3755 3756 for chrom in chomosomes_list: 3757 3758 # Create BED on initial VCF 3759 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3760 tmp_bed = NamedTemporaryFile( 3761 prefix=self.get_prefix(), 3762 
dir=self.get_tmp_dir(), 3763 suffix=".bed", 3764 delete=False, 3765 ) 3766 tmp_bed_name = tmp_bed.name 3767 tmp_files.append(tmp_bed_name) 3768 3769 # Detecte regions 3770 log.debug( 3771 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3772 ) 3773 window = 1000000 3774 sql_query_intervals_for_bed = f""" 3775 SELECT \"#CHROM\", 3776 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3777 \"POS\"+{window} 3778 FROM {table_variants} as table_variants 3779 WHERE table_variants.\"#CHROM\" = '{chrom}' 3780 """ 3781 regions = self.conn.execute( 3782 sql_query_intervals_for_bed 3783 ).fetchall() 3784 merged_regions = merge_regions(regions) 3785 log.debug( 3786 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3787 ) 3788 3789 header = ["#CHROM", "START", "END"] 3790 with open(tmp_bed_name, "w") as f: 3791 # Write the header with tab delimiter 3792 f.write("\t".join(header) + "\n") 3793 for d in merged_regions: 3794 # Write each data row with tab delimiter 3795 f.write("\t".join(map(str, d)) + "\n") 3796 3797 # Tmp files 3798 tmp_annotation_vcf = NamedTemporaryFile( 3799 prefix=self.get_prefix(), 3800 dir=self.get_tmp_dir(), 3801 suffix=".vcf.gz", 3802 delete=False, 3803 ) 3804 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3805 tmp_files.append(tmp_annotation_vcf_name) 3806 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3807 tmp_annotation_vcf_name_err = ( 3808 tmp_annotation_vcf_name + ".err" 3809 ) 3810 err_files.append(tmp_annotation_vcf_name_err) 3811 3812 # Annotate Command 3813 log.debug( 3814 f"Annotation '{annotation}' - add bcftools command" 3815 ) 3816 3817 # Command 3818 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3819 3820 # Add command 3821 commands.append(command_annotate) 3822 3823 # if some commands 3824 if commands: 3825 3826 # Export VCF file 3827 self.export_variant_vcf( 3828 vcf_file=tmp_vcf_name, 3829 remove_info=True, 3830 add_samples=False, 3831 index=True, 3832 ) 3833 3834 # Threads 3835 # calculate threads for annotated commands 3836 if commands: 3837 threads_bcftools_annotate = round(threads / len(commands)) 3838 else: 3839 threads_bcftools_annotate = 1 3840 3841 if not threads_bcftools_annotate: 3842 threads_bcftools_annotate = 1 3843 3844 # Add threads option to bcftools commands 3845 if threads_bcftools_annotate > 1: 3846 commands_threaded = [] 3847 for command in commands: 3848 commands_threaded.append( 3849 command.replace( 3850 f"{bcftools_bin_command} annotate ", 3851 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3852 ) 3853 ) 3854 commands = commands_threaded 3855 3856 # Command annotation multithreading 3857 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3858 log.info( 3859 f"Annotation - Annotation multithreaded in " 3860 + str(len(commands)) 3861 + " commands" 3862 ) 3863 3864 run_parallel_commands(commands, threads) 3865 3866 # Merge 3867 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 3868 3869 if tmp_ann_vcf_list_cmd: 3870 3871 # Tmp file 3872 tmp_annotate_vcf = NamedTemporaryFile( 3873 prefix=self.get_prefix(), 3874 dir=self.get_tmp_dir(), 3875 suffix=".vcf.gz", 3876 delete=True, 3877 ) 3878 tmp_annotate_vcf_name = tmp_annotate_vcf.name 3879 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 3880 err_files.append(tmp_annotate_vcf_name_err) 3881 3882 # Tmp file remove command 3883 tmp_files_remove_command = "" 3884 if tmp_files: 3885 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 3886 3887 # Command merge 3888 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 3889 log.info( 3890 f"Annotation - Annotation merging " 3891 + str(len(commands)) 3892 + " annotated files" 3893 ) 3894 log.debug(f"Annotation - merge command: {merge_command}") 3895 run_parallel_commands([merge_command], 1) 3896 3897 # Error messages 3898 log.info(f"Error/Warning messages:") 3899 error_message_command_all = [] 3900 error_message_command_warning = [] 3901 error_message_command_err = [] 3902 for err_file in err_files: 3903 with open(err_file, "r") as f: 3904 for line in f: 3905 message = line.strip() 3906 error_message_command_all.append(message) 3907 if line.startswith("[W::"): 3908 error_message_command_warning.append(message) 3909 if line.startswith("[E::"): 3910 error_message_command_err.append( 3911 f"{err_file}: " + message 3912 ) 3913 # log info 3914 for message in list( 3915 set(error_message_command_err + error_message_command_warning) 3916 ): 3917 log.info(f" {message}") 3918 # debug info 3919 for message in list(set(error_message_command_all)): 3920 log.debug(f" {message}") 3921 # failed 3922 if len(error_message_command_err): 3923 log.error("Annotation failed: Error in commands") 3924 raise ValueError("Annotation failed: Error in commands") 3925 3926 # Update variants 3927 log.info(f"Annotation - Updating...") 3928 self.update_from_vcf(tmp_annotate_vcf_name) 3929 3930 def annotation_exomiser(self, threads: int = None) -> None: 3931 """ 3932 This function annotate with Exomiser 3933 3934 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 3935 - "analysis" (dict/file): 3936 Full analysis dictionnary parameters (see Exomiser docs). 3937 Either a dict, or a file in JSON or YAML format. 3938 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 3939 Default : None 3940 - "preset" (string): 3941 Analysis preset (available in config folder). 
3942 Used if no full "analysis" is provided. 3943 Default: "exome" 3944 - "phenopacket" (dict/file): 3945 Samples and phenotipic features parameters (see Exomiser docs). 3946 Either a dict, or a file in JSON or YAML format. 3947 Default: None 3948 - "subject" (dict): 3949 Sample parameters (see Exomiser docs). 3950 Example: 3951 "subject": 3952 { 3953 "id": "ISDBM322017", 3954 "sex": "FEMALE" 3955 } 3956 Default: None 3957 - "sample" (string): 3958 Sample name to construct "subject" section: 3959 "subject": 3960 { 3961 "id": "<sample>", 3962 "sex": "UNKNOWN_SEX" 3963 } 3964 Default: None 3965 - "phenotypicFeatures" (dict) 3966 Phenotypic features to construct "subject" section. 3967 Example: 3968 "phenotypicFeatures": 3969 [ 3970 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 3971 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 3972 ] 3973 - "hpo" (list) 3974 List of HPO ids as phenotypic features. 3975 Example: 3976 "hpo": ['0001156', '0001363', '0011304', '0010055'] 3977 Default: [] 3978 - "outputOptions" (dict): 3979 Output options (see Exomiser docs). 3980 Default: 3981 "output_options" = 3982 { 3983 "outputContributingVariantsOnly": False, 3984 "numGenes": 0, 3985 "outputFormats": ["TSV_VARIANT", "VCF"] 3986 } 3987 - "transcript_source" (string): 3988 Transcript source (either "refseq", "ucsc", "ensembl") 3989 Default: "refseq" 3990 - "exomiser_to_info" (boolean): 3991 Add exomiser TSV file columns as INFO fields in VCF. 3992 Default: False 3993 - "release" (string): 3994 Exomise database release. 3995 If not exists, database release will be downloaded (take a while). 3996 Default: None (provided by application.properties configuration file) 3997 - "exomiser_application_properties" (file): 3998 Exomiser configuration file (see Exomiser docs). 3999 Useful to automatically download databases (especially for specific genome databases). 
4000 4001 Notes: 4002 - If no sample in parameters, first sample in VCF will be chosen 4003 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4004 4005 :param threads: The number of threads to use 4006 :return: None. 4007 """ 4008 4009 # DEBUG 4010 log.debug("Start annotation with Exomiser databases") 4011 4012 # Threads 4013 if not threads: 4014 threads = self.get_threads() 4015 log.debug("Threads: " + str(threads)) 4016 4017 # Config 4018 config = self.get_config() 4019 log.debug("Config: " + str(config)) 4020 4021 # Config - Folders - Databases 4022 databases_folders = ( 4023 config.get("folders", {}) 4024 .get("databases", {}) 4025 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4026 ) 4027 databases_folders = full_path(databases_folders) 4028 if not os.path.exists(databases_folders): 4029 log.error(f"Databases annotations: {databases_folders} NOT found") 4030 log.debug("Databases annotations: " + str(databases_folders)) 4031 4032 # Config - Exomiser 4033 exomiser_bin_command = get_bin_command( 4034 bin="exomiser-cli*.jar", 4035 tool="exomiser", 4036 bin_type="jar", 4037 config=config, 4038 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4039 ) 4040 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4041 if not exomiser_bin_command: 4042 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4043 log.error(msg_err) 4044 raise ValueError(msg_err) 4045 4046 # Param 4047 param = self.get_param() 4048 log.debug("Param: " + str(param)) 4049 4050 # Param - Exomiser 4051 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4052 log.debug(f"Param Exomiser: {param_exomiser}") 4053 4054 # Param - Assembly 4055 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4056 log.debug("Assembly: " + str(assembly)) 4057 4058 # Data 4059 table_variants = self.get_table_variants() 4060 4061 # Check if not empty 4062 log.debug("Check if not empty") 4063 sql_query_chromosomes = 
( 4064 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4065 ) 4066 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4067 log.info(f"VCF empty") 4068 return False 4069 4070 # VCF header 4071 vcf_reader = self.get_header() 4072 log.debug("Initial header: " + str(vcf_reader.infos)) 4073 4074 # Samples 4075 samples = self.get_header_sample_list() 4076 if not samples: 4077 log.error("No Samples in VCF") 4078 return False 4079 log.debug(f"Samples: {samples}") 4080 4081 # Memory limit 4082 memory_limit = self.get_memory("8G") 4083 log.debug(f"memory_limit: {memory_limit}") 4084 4085 # Exomiser java options 4086 exomiser_java_options = ( 4087 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4088 ) 4089 log.debug(f"Exomiser java options: {exomiser_java_options}") 4090 4091 # Download Exomiser (if not exists) 4092 exomiser_release = param_exomiser.get("release", None) 4093 exomiser_application_properties = param_exomiser.get( 4094 "exomiser_application_properties", None 4095 ) 4096 databases_download_exomiser( 4097 assemblies=[assembly], 4098 exomiser_folder=databases_folders, 4099 exomiser_release=exomiser_release, 4100 exomiser_phenotype_release=exomiser_release, 4101 exomiser_application_properties=exomiser_application_properties, 4102 ) 4103 4104 # Force annotation 4105 force_update_annotation = True 4106 4107 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4108 log.debug("Start annotation Exomiser") 4109 4110 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4111 4112 # tmp_dir = "/tmp/exomiser" 4113 4114 ### ANALYSIS ### 4115 ################ 4116 4117 # Create analysis.json through analysis dict 4118 # either analysis in param or by default 4119 # depending on preset exome/genome) 4120 4121 # Init analysis dict 4122 param_exomiser_analysis_dict = {} 4123 4124 # analysis from param 4125 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4126 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4127 4128 # If analysis in param -> load anlaysis json 4129 if param_exomiser_analysis: 4130 4131 # If param analysis is a file and exists 4132 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4133 param_exomiser_analysis 4134 ): 4135 # Load analysis file into analysis dict (either yaml or json) 4136 with open(param_exomiser_analysis) as json_file: 4137 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4138 4139 # If param analysis is a dict 4140 elif isinstance(param_exomiser_analysis, dict): 4141 # Load analysis dict into analysis dict (either yaml or json) 4142 param_exomiser_analysis_dict = param_exomiser_analysis 4143 4144 # Error analysis type 4145 else: 4146 log.error(f"Analysis type unknown. Check param file.") 4147 raise ValueError(f"Analysis type unknown. Check param file.") 4148 4149 # Case no input analysis config file/dict 4150 # Use preset (exome/genome) to open default config file 4151 if not param_exomiser_analysis_dict: 4152 4153 # default preset 4154 default_preset = "exome" 4155 4156 # Get param preset or default preset 4157 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4158 4159 # Try to find if preset is a file 4160 if os.path.exists(param_exomiser_preset): 4161 # Preset file is provided in full path 4162 param_exomiser_analysis_default_config_file = ( 4163 param_exomiser_preset 4164 ) 4165 # elif os.path.exists(full_path(param_exomiser_preset)): 4166 # # Preset file is provided in full path 4167 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4168 elif os.path.exists( 4169 os.path.join(folder_config, param_exomiser_preset) 4170 ): 4171 # Preset file is provided a basename in config folder (can be a path with subfolders) 4172 param_exomiser_analysis_default_config_file = os.path.join( 4173 folder_config, param_exomiser_preset 4174 ) 4175 else: 4176 # Construct preset file 4177 
param_exomiser_analysis_default_config_file = os.path.join( 4178 folder_config, 4179 f"preset-{param_exomiser_preset}-analysis.json", 4180 ) 4181 4182 # If preset file exists 4183 param_exomiser_analysis_default_config_file = full_path( 4184 param_exomiser_analysis_default_config_file 4185 ) 4186 if os.path.exists(param_exomiser_analysis_default_config_file): 4187 # Load prest file into analysis dict (either yaml or json) 4188 with open( 4189 param_exomiser_analysis_default_config_file 4190 ) as json_file: 4191 # param_exomiser_analysis_dict[""] = json.load(json_file) 4192 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4193 json_file 4194 ) 4195 4196 # Error preset file 4197 else: 4198 log.error( 4199 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4200 ) 4201 raise ValueError( 4202 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4203 ) 4204 4205 # If no analysis dict created 4206 if not param_exomiser_analysis_dict: 4207 log.error(f"No analysis config") 4208 raise ValueError(f"No analysis config") 4209 4210 # Log 4211 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4212 4213 ### PHENOPACKET ### 4214 ################### 4215 4216 # If no PhenoPacket in analysis dict -> check in param 4217 if "phenopacket" not in param_exomiser_analysis_dict: 4218 4219 # If PhenoPacket in param -> load anlaysis json 4220 if param_exomiser.get("phenopacket", None): 4221 4222 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4223 param_exomiser_phenopacket = full_path( 4224 param_exomiser_phenopacket 4225 ) 4226 4227 # If param phenopacket is a file and exists 4228 if isinstance( 4229 param_exomiser_phenopacket, str 4230 ) and os.path.exists(param_exomiser_phenopacket): 4231 # Load phenopacket file into analysis dict (either yaml or json) 4232 with open(param_exomiser_phenopacket) as json_file: 4233 param_exomiser_analysis_dict["phenopacket"] = ( 4234 yaml.safe_load(json_file) 
4235 ) 4236 4237 # If param phenopacket is a dict 4238 elif isinstance(param_exomiser_phenopacket, dict): 4239 # Load phenopacket dict into analysis dict (either yaml or json) 4240 param_exomiser_analysis_dict["phenopacket"] = ( 4241 param_exomiser_phenopacket 4242 ) 4243 4244 # Error phenopacket type 4245 else: 4246 log.error(f"Phenopacket type unknown. Check param file.") 4247 raise ValueError( 4248 f"Phenopacket type unknown. Check param file." 4249 ) 4250 4251 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4252 if "phenopacket" not in param_exomiser_analysis_dict: 4253 4254 # Init PhenoPacket 4255 param_exomiser_analysis_dict["phenopacket"] = { 4256 "id": "analysis", 4257 "proband": {}, 4258 } 4259 4260 ### Add subject ### 4261 4262 # If subject exists 4263 param_exomiser_subject = param_exomiser.get("subject", {}) 4264 4265 # If subject not exists -> found sample ID 4266 if not param_exomiser_subject: 4267 4268 # Found sample ID in param 4269 sample = param_exomiser.get("sample", None) 4270 4271 # Find sample ID (first sample) 4272 if not sample: 4273 sample_list = self.get_header_sample_list() 4274 if len(sample_list) > 0: 4275 sample = sample_list[0] 4276 else: 4277 log.error(f"No sample found") 4278 raise ValueError(f"No sample found") 4279 4280 # Create subject 4281 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4282 4283 # Add to dict 4284 param_exomiser_analysis_dict["phenopacket"][ 4285 "subject" 4286 ] = param_exomiser_subject 4287 4288 ### Add "phenotypicFeatures" ### 4289 4290 # If phenotypicFeatures exists 4291 param_exomiser_phenotypicfeatures = param_exomiser.get( 4292 "phenotypicFeatures", [] 4293 ) 4294 4295 # If phenotypicFeatures not exists -> Try to infer from hpo list 4296 if not param_exomiser_phenotypicfeatures: 4297 4298 # Found HPO in param 4299 param_exomiser_hpo = param_exomiser.get("hpo", []) 4300 4301 # Split HPO if list in string format separated by comma 4302 if 
isinstance(param_exomiser_hpo, str): 4303 param_exomiser_hpo = param_exomiser_hpo.split(",") 4304 4305 # Create HPO list 4306 for hpo in param_exomiser_hpo: 4307 hpo_clean = re.sub("[^0-9]", "", hpo) 4308 param_exomiser_phenotypicfeatures.append( 4309 { 4310 "type": { 4311 "id": f"HP:{hpo_clean}", 4312 "label": f"HP:{hpo_clean}", 4313 } 4314 } 4315 ) 4316 4317 # Add to dict 4318 param_exomiser_analysis_dict["phenopacket"][ 4319 "phenotypicFeatures" 4320 ] = param_exomiser_phenotypicfeatures 4321 4322 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4323 if not param_exomiser_phenotypicfeatures: 4324 for step in param_exomiser_analysis_dict.get( 4325 "analysis", {} 4326 ).get("steps", []): 4327 if "hiPhivePrioritiser" in step: 4328 param_exomiser_analysis_dict.get("analysis", {}).get( 4329 "steps", [] 4330 ).remove(step) 4331 4332 ### Add Input File ### 4333 4334 # Initial file name and htsFiles 4335 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4336 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4337 { 4338 "uri": tmp_vcf_name, 4339 "htsFormat": "VCF", 4340 "genomeAssembly": assembly, 4341 } 4342 ] 4343 4344 ### Add metaData ### 4345 4346 # If metaData not in analysis dict 4347 if "metaData" not in param_exomiser_analysis_dict: 4348 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4349 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4350 "createdBy": "howard", 4351 "phenopacketSchemaVersion": 1, 4352 } 4353 4354 ### OutputOptions ### 4355 4356 # Init output result folder 4357 output_results = os.path.join(tmp_dir, "results") 4358 4359 # If no outputOptions in analysis dict 4360 if "outputOptions" not in param_exomiser_analysis_dict: 4361 4362 # default output formats 4363 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4364 4365 # Get outputOptions in param 4366 output_options = param_exomiser.get("outputOptions", None) 4367 4368 # If no output_options in param -> check 4369 if not output_options: 
4370 output_options = { 4371 "outputContributingVariantsOnly": False, 4372 "numGenes": 0, 4373 "outputFormats": defaut_output_formats, 4374 } 4375 4376 # Replace outputDirectory in output options 4377 output_options["outputDirectory"] = output_results 4378 output_options["outputFileName"] = "howard" 4379 4380 # Add outputOptions in analysis dict 4381 param_exomiser_analysis_dict["outputOptions"] = output_options 4382 4383 else: 4384 4385 # Replace output_results and output format (if exists in param) 4386 param_exomiser_analysis_dict["outputOptions"][ 4387 "outputDirectory" 4388 ] = output_results 4389 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4390 list( 4391 set( 4392 param_exomiser_analysis_dict.get( 4393 "outputOptions", {} 4394 ).get("outputFormats", []) 4395 + ["TSV_VARIANT", "VCF"] 4396 ) 4397 ) 4398 ) 4399 4400 # log 4401 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4402 4403 ### ANALYSIS FILE ### 4404 ##################### 4405 4406 ### Full JSON analysis config file ### 4407 4408 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4409 with open(exomiser_analysis, "w") as fp: 4410 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4411 4412 ### SPLIT analysis and sample config files 4413 4414 # Splitted analysis dict 4415 param_exomiser_analysis_dict_for_split = ( 4416 param_exomiser_analysis_dict.copy() 4417 ) 4418 4419 # Phenopacket JSON file 4420 exomiser_analysis_phenopacket = os.path.join( 4421 tmp_dir, "analysis_phenopacket.json" 4422 ) 4423 with open(exomiser_analysis_phenopacket, "w") as fp: 4424 json.dump( 4425 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4426 fp, 4427 indent=4, 4428 ) 4429 4430 # Analysis JSON file without Phenopacket parameters 4431 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4432 exomiser_analysis_analysis = os.path.join( 4433 tmp_dir, "analysis_analysis.json" 4434 ) 4435 with open(exomiser_analysis_analysis, "w") as fp: 4436 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4437 4438 ### INITAL VCF file ### 4439 ####################### 4440 4441 ### Create list of samples to use and include inti initial VCF file #### 4442 4443 # Subject (main sample) 4444 # Get sample ID in analysis dict 4445 sample_subject = ( 4446 param_exomiser_analysis_dict.get("phenopacket", {}) 4447 .get("subject", {}) 4448 .get("id", None) 4449 ) 4450 sample_proband = ( 4451 param_exomiser_analysis_dict.get("phenopacket", {}) 4452 .get("proband", {}) 4453 .get("subject", {}) 4454 .get("id", None) 4455 ) 4456 sample = [] 4457 if sample_subject: 4458 sample.append(sample_subject) 4459 if sample_proband: 4460 sample.append(sample_proband) 4461 4462 # Get sample ID within Pedigree 4463 pedigree_persons_list = ( 4464 param_exomiser_analysis_dict.get("phenopacket", {}) 4465 .get("pedigree", {}) 4466 .get("persons", {}) 4467 ) 4468 4469 # Create list with all sample ID in pedigree (if exists) 4470 pedigree_persons = [] 4471 for person in pedigree_persons_list: 4472 pedigree_persons.append(person.get("individualId")) 4473 4474 # Concat subject sample ID and samples ID in pedigreesamples 4475 samples = list(set(sample + pedigree_persons)) 4476 4477 # Check if sample list is not empty 4478 if not samples: 4479 log.error(f"No samples found") 4480 raise ValueError(f"No samples found") 4481 4482 # Create VCF with sample (either sample in param or first one by default) 4483 # Export VCF file 4484 self.export_variant_vcf( 4485 vcf_file=tmp_vcf_name, 4486 remove_info=True, 4487 add_samples=True, 4488 list_samples=samples, 4489 index=False, 4490 ) 4491 4492 ### Execute Exomiser ### 4493 ######################## 4494 4495 # Init command 4496 exomiser_command = "" 4497 4498 # Command exomiser options 4499 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4500 4501 # Release 4502 exomiser_release = 
param_exomiser.get("release", None) 4503 if exomiser_release: 4504 # phenotype data version 4505 exomiser_options += ( 4506 f" --exomiser.phenotype.data-version={exomiser_release} " 4507 ) 4508 # data version 4509 exomiser_options += ( 4510 f" --exomiser.{assembly}.data-version={exomiser_release} " 4511 ) 4512 # variant white list 4513 variant_white_list_file = ( 4514 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4515 ) 4516 if os.path.exists( 4517 os.path.join( 4518 databases_folders, assembly, variant_white_list_file 4519 ) 4520 ): 4521 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4522 4523 # transcript_source 4524 transcript_source = param_exomiser.get( 4525 "transcript_source", None 4526 ) # ucsc, refseq, ensembl 4527 if transcript_source: 4528 exomiser_options += ( 4529 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4530 ) 4531 4532 # If analysis contain proband param 4533 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4534 "proband", {} 4535 ): 4536 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4537 4538 # If no proband (usually uniq sample) 4539 else: 4540 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 4541 4542 # Log 4543 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 4544 4545 # Run command 4546 result = subprocess.call( 4547 exomiser_command_analysis.split(), stdout=subprocess.PIPE 4548 ) 4549 if result: 4550 log.error("Exomiser command failed") 4551 raise ValueError("Exomiser command failed") 4552 4553 ### RESULTS ### 4554 ############### 4555 4556 ### Annotate with TSV fields ### 4557 4558 # Init result tsv file 4559 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 4560 4561 # Init result tsv file 4562 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 4563 4564 # Parse TSV file and explode columns in INFO field 4565 if exomiser_to_info and os.path.exists(output_results_tsv): 4566 4567 # Log 4568 log.debug("Exomiser columns to VCF INFO field") 4569 4570 # Retrieve columns and types 4571 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 4572 output_results_tsv_df = self.get_query_to_df(query) 4573 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 4574 4575 # Init concat fields for update 4576 sql_query_update_concat_fields = [] 4577 4578 # Fields to avoid 4579 fields_to_avoid = [ 4580 "CONTIG", 4581 "START", 4582 "END", 4583 "REF", 4584 "ALT", 4585 "QUAL", 4586 "FILTER", 4587 "GENOTYPE", 4588 ] 4589 4590 # List all columns to add into header 4591 for header_column in output_results_tsv_columns: 4592 4593 # If header column is enable 4594 if header_column not in fields_to_avoid: 4595 4596 # Header info type 4597 header_info_type = "String" 4598 header_column_df = output_results_tsv_df[header_column] 4599 header_column_df_dtype = header_column_df.dtype 4600 if header_column_df_dtype == object: 4601 if ( 4602 pd.to_numeric(header_column_df, errors="coerce") 4603 .notnull() 4604 .all() 4605 ): 4606 header_info_type = "Float" 4607 else: 4608 header_info_type = "Integer" 4609 4610 # Header info 4611 characters_to_validate = ["-"] 4612 pattern = "[" + "".join(characters_to_validate) + "]" 4613 header_info_name = re.sub( 4614 pattern, 4615 "_", 4616 f"Exomiser_{header_column}".replace("#", ""), 4617 ) 4618 header_info_number = "." 
4619 header_info_description = ( 4620 f"Exomiser {header_column} annotation" 4621 ) 4622 header_info_source = "Exomiser" 4623 header_info_version = "unknown" 4624 header_info_code = CODE_TYPE_MAP[header_info_type] 4625 vcf_reader.infos[header_info_name] = vcf.parser._Info( 4626 header_info_name, 4627 header_info_number, 4628 header_info_type, 4629 header_info_description, 4630 header_info_source, 4631 header_info_version, 4632 header_info_code, 4633 ) 4634 4635 # Add field to add for update to concat fields 4636 sql_query_update_concat_fields.append( 4637 f""" 4638 CASE 4639 WHEN table_parquet."{header_column}" NOT IN ('','.') 4640 THEN concat( 4641 '{header_info_name}=', 4642 table_parquet."{header_column}", 4643 ';' 4644 ) 4645 4646 ELSE '' 4647 END 4648 """ 4649 ) 4650 4651 # Update query 4652 sql_query_update = f""" 4653 UPDATE {table_variants} as table_variants 4654 SET INFO = concat( 4655 CASE 4656 WHEN INFO NOT IN ('', '.') 4657 THEN INFO 4658 ELSE '' 4659 END, 4660 CASE 4661 WHEN table_variants.INFO NOT IN ('','.') 4662 THEN ';' 4663 ELSE '' 4664 END, 4665 ( 4666 SELECT 4667 concat( 4668 {",".join(sql_query_update_concat_fields)} 4669 ) 4670 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 4671 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 4672 AND table_parquet.\"START\" = table_variants.\"POS\" 4673 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 4674 AND table_parquet.\"REF\" = table_variants.\"REF\" 4675 ) 4676 ) 4677 ; 4678 """ 4679 4680 # Update 4681 self.conn.execute(sql_query_update) 4682 4683 ### Annotate with VCF INFO field ### 4684 4685 # Init result VCF file 4686 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 4687 4688 # If VCF exists 4689 if os.path.exists(output_results_vcf): 4690 4691 # Log 4692 log.debug("Exomiser result VCF update variants") 4693 4694 # Find Exomiser INFO field annotation in header 4695 with 
gzip.open(output_results_vcf, "rt") as f:
                header_list = self.read_vcf_header(f)
                exomiser_vcf_header = vcf.Reader(
                    io.StringIO("\n".join(header_list))
                )

            # Add annotation INFO field to header
            vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

            # Update variants with VCF
            self.update_from_vcf(output_results_vcf)

        return True

    def annotation_snpeff(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with snpEff.

        Exports the current variants to a temporary bgzipped VCF, downloads the
        snpEff database for the configured assembly if needed, runs the snpEff
        jar on the export, merges the new INFO header fields from the annotated
        VCF into the current header, then updates the variants table from the
        annotated VCF. Returns early (without annotating) if the variants table
        is empty.

        :param threads: The number of threads to use
        :return: the value of the variable "return_value".
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed and logged but never used in this
        # method (snpEff tmp files are left to NamedTemporaryFile semantics) —
        # confirm intended.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # # Config - Java
        # java_bin = get_bin(
        #     tool="java",
        #     bin="java",
        #     bin_type="bin",
        #     config=config,
        #     default_folder="/usr/bin",
        # )
        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")

        # # Config - snpEff bin
        # snpeff_jar = get_bin(
        #     tool="snpeff",
        #     bin="snpEff.jar",
        #     bin_type="jar",
        #     config=config,
        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        # )
        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")

        # Config - snpEff bin command (full "java -jar snpEff.jar"-style command line)
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases (folder is created if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param
        # NOTE(review): 'options' duplicates 'snpeff_options' below (different
        # default) and is not used afterwards in this method.
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options ("OUTPUT" placeholder in stats paths is replaced by the output file)
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # delete=True: file is removed when the tmp_vcf handle is garbage
        # collected / closed, so tmp_vcf must stay in scope while used.
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # if config.get("memory", None):
        #     memory_limit = config.get("memory", "8G")
        # else:
        #     memory_limit = "8G"
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): the debug message below says "Exomiser" but these are
        # snpEff java options; also snpeff_java_options is not inserted into
        # snpeff_command below — confirm the bin command already carries the
        # java options, otherwise this value is dead.
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Hard-coded: existing ANN annotations are always recomputed
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database (downloads it for the assembly if missing)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file (delete=False: consumed after the subprocess finishes)
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command: snpEff writes the annotated VCF to stdout, stderr is captured to .err
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages: scan captured stderr for htslib-style [W::]/[E::] tags
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed: any [E::] line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header of the snpEff output and merge new INFO fields
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] =
annovar_vcf_header.infos.get(ann) 4937 4938 # Update variants 4939 log.info(f"Annotation - Updating...") 4940 self.update_from_vcf(tmp_annotate_vcf_name) 4941 4942 else: 4943 if "ANN" in self.get_header().infos: 4944 log.debug(f"Existing snpEff annotations in VCF") 4945 if force_update_annotation: 4946 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 4947 4948 def annotation_annovar(self, threads: int = None) -> None: 4949 """ 4950 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 4951 annotations 4952 4953 :param threads: number of threads to use 4954 :return: the value of the variable "return_value". 4955 """ 4956 4957 # DEBUG 4958 log.debug("Start annotation with Annovar databases") 4959 4960 # Threads 4961 if not threads: 4962 threads = self.get_threads() 4963 log.debug("Threads: " + str(threads)) 4964 4965 # Tmp en Err files 4966 tmp_files = [] 4967 err_files = [] 4968 4969 # DEBUG 4970 delete_tmp = True 4971 if self.get_config().get("verbosity", "warning") in ["debug"]: 4972 delete_tmp = False 4973 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4974 4975 # Config 4976 config = self.get_config() 4977 log.debug("Config: " + str(config)) 4978 4979 # Config - Folders - Databases 4980 databases_folders = ( 4981 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 4982 ) 4983 log.debug("Databases annotations: " + str(databases_folders)) 4984 4985 # Config - annovar bin command 4986 annovar_bin_command = get_bin_command( 4987 bin="table_annovar.pl", 4988 tool="annovar", 4989 bin_type="perl", 4990 config=config, 4991 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 4992 ) 4993 if not annovar_bin_command: 4994 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 4995 log.error(msg_err) 4996 raise ValueError(msg_err) 4997 4998 # Config - BCFTools bin command 4999 bcftools_bin_command = get_bin_command( 5000 bin="bcftools", 5001 tool="bcftools", 5002 
bin_type="bin", 5003 config=config, 5004 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5005 ) 5006 if not bcftools_bin_command: 5007 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5008 log.error(msg_err) 5009 raise ValueError(msg_err) 5010 5011 # Config - annovar databases 5012 annovar_databases = ( 5013 config.get("folders", {}) 5014 .get("databases", {}) 5015 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5016 ) 5017 annovar_databases = full_path(annovar_databases) 5018 if annovar_databases != "" and not os.path.exists(annovar_databases): 5019 os.makedirs(annovar_databases) 5020 5021 # Param 5022 param = self.get_param() 5023 log.debug("Param: " + str(param)) 5024 5025 # Param - options 5026 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5027 log.debug("Options: " + str(options)) 5028 5029 # Param - annotations 5030 annotations = ( 5031 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5032 ) 5033 log.debug("Annotations: " + str(annotations)) 5034 5035 # Param - Assembly 5036 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5037 5038 # Annovar database assembly 5039 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5040 if annovar_databases_assembly != "" and not os.path.exists( 5041 annovar_databases_assembly 5042 ): 5043 os.makedirs(annovar_databases_assembly) 5044 5045 # Data 5046 table_variants = self.get_table_variants() 5047 5048 # Check if not empty 5049 log.debug("Check if not empty") 5050 sql_query_chromosomes = ( 5051 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5052 ) 5053 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5054 if not sql_query_chromosomes_df["count"][0]: 5055 log.info(f"VCF empty") 5056 return 5057 5058 # VCF header 5059 vcf_reader = self.get_header() 5060 log.debug("Initial header: " + str(vcf_reader.infos)) 5061 5062 # Existing annotations 5063 for vcf_annotation in 
self.get_header().infos: 5064 5065 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5066 log.debug( 5067 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5068 ) 5069 5070 force_update_annotation = True 5071 5072 if annotations: 5073 5074 commands = [] 5075 tmp_annotates_vcf_name_list = [] 5076 5077 # Export in VCF 5078 log.debug("Create initial file to annotate") 5079 tmp_vcf = NamedTemporaryFile( 5080 prefix=self.get_prefix(), 5081 dir=self.get_tmp_dir(), 5082 suffix=".vcf.gz", 5083 delete=False, 5084 ) 5085 tmp_vcf_name = tmp_vcf.name 5086 tmp_files.append(tmp_vcf_name) 5087 tmp_files.append(tmp_vcf_name + ".tbi") 5088 5089 # Export VCF file 5090 self.export_variant_vcf( 5091 vcf_file=tmp_vcf_name, 5092 remove_info=".", 5093 add_samples=False, 5094 index=True, 5095 ) 5096 5097 # Create file for field rename 5098 log.debug("Create file for field rename") 5099 tmp_rename = NamedTemporaryFile( 5100 prefix=self.get_prefix(), 5101 dir=self.get_tmp_dir(), 5102 suffix=".rename", 5103 delete=False, 5104 ) 5105 tmp_rename_name = tmp_rename.name 5106 tmp_files.append(tmp_rename_name) 5107 5108 # Check Annovar database 5109 log.debug( 5110 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5111 ) 5112 databases_download_annovar( 5113 folder=annovar_databases, 5114 files=list(annotations.keys()), 5115 assemblies=[assembly], 5116 ) 5117 5118 for annotation in annotations: 5119 annotation_fields = annotations[annotation] 5120 5121 if not annotation_fields: 5122 annotation_fields = {"INFO": None} 5123 5124 log.info(f"Annotations Annovar - database '{annotation}'") 5125 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5126 5127 # Tmp file for annovar 5128 err_files = [] 5129 tmp_annotate_vcf_directory = TemporaryDirectory( 5130 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5131 ) 5132 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5133 
tmp_annotate_vcf_name_annovar = ( 5134 tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf" 5135 ) 5136 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5137 err_files.append(tmp_annotate_vcf_name_err) 5138 tmp_files.append(tmp_annotate_vcf_name_err) 5139 5140 # Tmp file final vcf annotated by annovar 5141 tmp_annotate_vcf = NamedTemporaryFile( 5142 prefix=self.get_prefix(), 5143 dir=self.get_tmp_dir(), 5144 suffix=".vcf.gz", 5145 delete=False, 5146 ) 5147 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5148 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5149 tmp_files.append(tmp_annotate_vcf_name) 5150 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5151 5152 # Number of fields 5153 annotation_list = [] 5154 annotation_renamed_list = [] 5155 5156 for annotation_field in annotation_fields: 5157 5158 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5159 annotation_fields_new_name = annotation_fields.get( 5160 annotation_field, annotation_field 5161 ) 5162 if not annotation_fields_new_name: 5163 annotation_fields_new_name = annotation_field 5164 5165 if ( 5166 force_update_annotation 5167 or annotation_fields_new_name not in self.get_header().infos 5168 ): 5169 annotation_list.append(annotation_field) 5170 annotation_renamed_list.append(annotation_fields_new_name) 5171 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5172 log.warning( 5173 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5174 ) 5175 5176 # Add rename info 5177 run_parallel_commands( 5178 [ 5179 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5180 ], 5181 1, 5182 ) 5183 5184 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5185 log.debug("annotation_list: " + str(annotation_list)) 5186 5187 # protocol 5188 protocol = annotation 5189 5190 # argument 5191 argument = "" 5192 5193 # operation 5194 operation = "f" 
5195 if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith( 5196 "ensGene" 5197 ): 5198 operation = "g" 5199 if options.get("genebase", None): 5200 argument = f"""'{options.get("genebase","")}'""" 5201 elif annotation in ["cytoBand"]: 5202 operation = "r" 5203 5204 # argument option 5205 argument_option = "" 5206 if argument != "": 5207 argument_option = " --argument " + argument 5208 5209 # command options 5210 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5211 for option in options: 5212 if option not in ["genebase"]: 5213 command_options += f""" --{option}={options[option]}""" 5214 5215 # Command 5216 5217 # Command - Annovar 5218 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5219 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5220 5221 # Command - start pipe 5222 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5223 5224 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5225 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5226 5227 # Command - Special characters (refGene annotation) 5228 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5229 5230 # Command - Clean empty fields (with value ".") 5231 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5232 5233 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5234 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5235 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5236 # for ann in annotation_renamed_list: 5237 for ann in annotation_list: 5238 annovar_fields_to_keep.append(f"^INFO/{ann}") 5239 5240 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5241 5242 # Command - indexing 5243 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5244 5245 log.debug(f"Annotation - Annovar command: {command_annovar}") 5246 run_parallel_commands([command_annovar], 1) 5247 5248 # Error messages 5249 log.info(f"Error/Warning messages:") 5250 error_message_command_all = [] 5251 error_message_command_warning = [] 5252 error_message_command_err = [] 5253 for err_file in err_files: 5254 with open(err_file, "r") as f: 5255 for line in f: 5256 message = line.strip() 5257 error_message_command_all.append(message) 5258 if line.startswith("[W::") or line.startswith("WARNING"): 5259 error_message_command_warning.append(message) 5260 if line.startswith("[E::") or line.startswith("ERROR"): 5261 
error_message_command_err.append( 5262 f"{err_file}: " + message 5263 ) 5264 # log info 5265 for message in list( 5266 set(error_message_command_err + error_message_command_warning) 5267 ): 5268 log.info(f" {message}") 5269 # debug info 5270 for message in list(set(error_message_command_all)): 5271 log.debug(f" {message}") 5272 # failed 5273 if len(error_message_command_err): 5274 log.error("Annotation failed: Error in commands") 5275 raise ValueError("Annotation failed: Error in commands") 5276 5277 if tmp_annotates_vcf_name_list: 5278 5279 # List of annotated files 5280 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5281 5282 # Tmp file 5283 tmp_annotate_vcf = NamedTemporaryFile( 5284 prefix=self.get_prefix(), 5285 dir=self.get_tmp_dir(), 5286 suffix=".vcf.gz", 5287 delete=False, 5288 ) 5289 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5290 tmp_files.append(tmp_annotate_vcf_name) 5291 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5292 err_files.append(tmp_annotate_vcf_name_err) 5293 tmp_files.append(tmp_annotate_vcf_name_err) 5294 5295 # Command merge 5296 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5297 log.info( 5298 f"Annotation Annovar - Annotation merging " 5299 + str(len(tmp_annotates_vcf_name_list)) 5300 + " annotated files" 5301 ) 5302 log.debug(f"Annotation - merge command: {merge_command}") 5303 run_parallel_commands([merge_command], 1) 5304 5305 # Find annotation in header 5306 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5307 header_list = self.read_vcf_header(f) 5308 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5309 5310 for ann in annovar_vcf_header.infos: 5311 if ann not in self.get_header().infos: 5312 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5313 5314 # Update variants 5315 log.info(f"Annotation Annovar - 
Updating...")
        self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the variants table with one or more parquet-format annotation
        databases, by rewriting the INFO column through per-chromosome SQL
        UPDATE queries run on the DuckDB connection.

        Annotation sources and fields come from param
        `annotation.parquet.annotations`; database files are looked up in the
        configured `folders.databases.annotations` and `folders.databases.parquet`
        folders. Matching is exact (#CHROM/POS/REF/ALT) for "variants" databases
        and positional overlap for "regions" databases. The in-memory VCF header
        is extended with the INFO fields added.

        :param threads: number of threads to use for the annotation; defaults to
            `self.get_threads()` when not provided
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # delete_tmp is computed for debugging visibility only; it is not used
        # below in this method
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        # Union of the "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        # annotations_update: re-annotate fields already present (old values removed)
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # annotations_append: only fill fields that are empty ('' or '.')
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): nothing below appends to added_columns in this method,
        # so the cleanup loop at the end is currently a no-op — confirm intended
        added_columns = []

        # drop indexes
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-annotation: scan available databases and add each
            # one (with all its INFO fields) to the annotations dict
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" itself is a directive, not a database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields
                # No explicit fields means "all fields from the database INFO"
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists
                if not parquet_file or not parquet_hdr_file:
                    log.error("Annotation failed: file not found")
                    raise ValueError("Annotation failed: file not found")
                else:
                    # Get parquet connexion
                    # Attach the database to the DuckDB connection if needed,
                    # then use its SQL link as the join target
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    # Register extra (non-header) database columns as synthetic
                    # String INFO fields so they can be annotated too
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        # Annotate when the field exists in the database header
                        # and either update/append is forced or the field is new
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            # In update mode, strip the existing field from
                            # INFO first (queued in query_dict_remove so it runs
                            # before the annotation queries)
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                        concat(table_variants.INFO,''),
                                        ';*{annotation_fields_new_name}=[^;]*',
                                        ''
                                    )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO
                            # First field gets no separator; later fields are
                            # prefixed with ';'
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append
                            # Append mode: only annotate rows whose current
                            # value for this field is empty ('' or '.')
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                    END
                                    """
                                )
                            # Found in a specific column
                            else:
                                # ';' in the value is replaced by ',' so the
                                # generated INFO field stays well-formed
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                    CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
                                        ELSE ''
                                    END
                                    """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    # Shortcut: when every field of the database is wanted and
                    # the database exposes a full INFO column, copy it wholesale
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                        """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Start from the removal queries (update mode) so old
                        # values are stripped before new ones are appended
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database
                            # Regions match by positional overlap and aggregate
                            # all overlapping region values per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database
                            # Exact match on #CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query
                            # Appends the generated annotation string to INFO,
                            # inserting a ';' only when both sides are non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO =
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            AND (
                                                concat({sql_query_annotation_update_info_sets_sql})
                                            )
                                            NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                            """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x
                        # The generated concat() can nest deeply with many
                        # fields; raise DuckDB's expression depth limit
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # DuckDB UPDATE returns the affected row count in
                            # a "Count" column
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def annotation_splice(self, threads: int = None) -> None:
        """
        This function annotates variants with splice tools (SPiP and SpliceAI)
        run through a docker container driving a Nextflow pipeline

        :param threads: The number of threads to use
        :return: the value of the variable "return_value".
5912 """ 5913 5914 # DEBUG 5915 log.debug("Start annotation with splice tools") 5916 5917 # Threads 5918 if not threads: 5919 threads = self.get_threads() 5920 log.debug("Threads: " + str(threads)) 5921 5922 # DEBUG 5923 delete_tmp = True 5924 if self.get_config().get("verbosity", "warning") in ["debug"]: 5925 delete_tmp = False 5926 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5927 5928 # Config 5929 config = self.get_config() 5930 log.debug("Config: " + str(config)) 5931 splice_config = config.get("tools", {}).get("splice", {}) 5932 if not splice_config: 5933 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5934 if not splice_config: 5935 msg_err = "No Splice tool config" 5936 log.error(msg_err) 5937 raise ValueError(msg_err) 5938 log.debug(f"splice_config={splice_config}") 5939 5940 # Config - Folders - Databases 5941 databases_folders = ( 5942 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5943 ) 5944 log.debug("Databases annotations: " + str(databases_folders)) 5945 5946 # Splice docker image 5947 splice_docker_image = splice_config.get("docker").get("image") 5948 5949 # Pull splice image if it's not already there 5950 if not check_docker_image_exists(splice_docker_image): 5951 log.warning( 5952 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5953 ) 5954 try: 5955 command(f"docker pull {splice_config.get('docker').get('image')}") 5956 except subprocess.CalledProcessError: 5957 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5958 log.error(msg_err) 5959 raise ValueError(msg_err) 5960 return None 5961 5962 # Config - splice databases 5963 splice_databases = ( 5964 config.get("folders", {}) 5965 .get("databases", {}) 5966 .get("splice", DEFAULT_SPLICE_FOLDER) 5967 ) 5968 splice_databases = full_path(splice_databases) 5969 5970 # Param 5971 param = self.get_param() 5972 log.debug("Param: " + str(param)) 5973 5974 # Param 5975 options = 
param.get("annotation", {}).get("splice", {}) 5976 log.debug("Options: " + str(options)) 5977 5978 # Data 5979 table_variants = self.get_table_variants() 5980 5981 # Check if not empty 5982 log.debug("Check if not empty") 5983 sql_query_chromosomes = ( 5984 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5985 ) 5986 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5987 log.info("VCF empty") 5988 return None 5989 5990 # Export in VCF 5991 log.debug("Create initial file to annotate") 5992 5993 # Create output folder 5994 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 5995 if not os.path.exists(output_folder): 5996 Path(output_folder).mkdir(parents=True, exist_ok=True) 5997 5998 # Create tmp VCF file 5999 tmp_vcf = NamedTemporaryFile( 6000 prefix=self.get_prefix(), 6001 dir=output_folder, 6002 suffix=".vcf", 6003 delete=False, 6004 ) 6005 tmp_vcf_name = tmp_vcf.name 6006 6007 # VCF header 6008 header = self.get_header() 6009 6010 # Existing annotations 6011 for vcf_annotation in self.get_header().infos: 6012 6013 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6014 log.debug( 6015 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6016 ) 6017 6018 # Memory limit 6019 if config.get("memory", None): 6020 memory_limit = config.get("memory", "8G").upper() 6021 # upper() 6022 else: 6023 memory_limit = "8G" 6024 log.debug(f"memory_limit: {memory_limit}") 6025 6026 # Check number of variants to annotate 6027 where_clause_regex_spliceai = r"SpliceAI_\w+" 6028 where_clause_regex_spip = r"SPiP_\w+" 6029 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6030 df_list_of_variants_to_annotate = self.get_query_to_df( 6031 query=f""" SELECT * FROM variants {where_clause} """ 6032 ) 6033 if len(df_list_of_variants_to_annotate) == 0: 6034 log.warning( 6035 f"No variants to 
annotate with splice. Variants probably already annotated with splice" 6036 ) 6037 return None 6038 else: 6039 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6040 6041 # Export VCF file 6042 self.export_variant_vcf( 6043 vcf_file=tmp_vcf_name, 6044 remove_info=True, 6045 add_samples=True, 6046 index=False, 6047 where_clause=where_clause, 6048 ) 6049 6050 # Create docker container and launch splice analysis 6051 if splice_config: 6052 6053 # Splice mount folders 6054 mount_folders = splice_config.get("mount", {}) 6055 6056 # Genome mount 6057 mount_folders[ 6058 config.get("folders", {}) 6059 .get("databases", {}) 6060 .get("genomes", DEFAULT_GENOME_FOLDER) 6061 ] = "ro" 6062 6063 # SpliceAI mount 6064 mount_folders[ 6065 config.get("folders", {}) 6066 .get("databases", {}) 6067 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6068 ] = "ro" 6069 6070 # Genome mount 6071 mount_folders[ 6072 config.get("folders", {}) 6073 .get("databases", {}) 6074 .get("spip", DEFAULT_SPIP_FOLDER) 6075 ] = "ro" 6076 6077 # Mount folders 6078 mount = [] 6079 6080 # Config mount 6081 mount = [ 6082 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6083 for path, mode in mount_folders.items() 6084 ] 6085 6086 if any(value for value in splice_config.values() if value is None): 6087 log.warning("At least one splice config parameter is empty") 6088 return None 6089 6090 # Params in splice nf 6091 def check_values(dico: dict): 6092 """ 6093 Ensure parameters for NF splice pipeline 6094 """ 6095 for key, val in dico.items(): 6096 if key == "genome": 6097 if any( 6098 assemb in options.get("genome", {}) 6099 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6100 ): 6101 yield f"--{key} hg19" 6102 elif any( 6103 assemb in options.get("genome", {}) 6104 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6105 ): 6106 yield f"--{key} hg38" 6107 elif ( 6108 (isinstance(val, str) and val) 6109 or isinstance(val, int) 6110 or isinstance(val, bool) 6111 ): 6112 yield f"--{key} 
{val}" 6113 6114 # Genome 6115 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6116 options["genome"] = genome 6117 6118 # NF params 6119 nf_params = [] 6120 6121 # Add options 6122 if options: 6123 nf_params = list(check_values(options)) 6124 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6125 else: 6126 log.debug("No NF params provided") 6127 6128 # Add threads 6129 if "threads" not in options.keys(): 6130 nf_params.append(f"--threads {threads}") 6131 6132 # Genome path 6133 genome_path = find_genome( 6134 config.get("folders", {}) 6135 .get("databases", {}) 6136 .get("genomes", DEFAULT_GENOME_FOLDER), 6137 file=f"{genome}.fa", 6138 ) 6139 # Add genome path 6140 if not genome_path: 6141 raise ValueError( 6142 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6143 ) 6144 else: 6145 log.debug(f"Genome: {genome_path}") 6146 nf_params.append(f"--genome_path {genome_path}") 6147 6148 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6149 """ 6150 Setting up updated databases for SPiP and SpliceAI 6151 """ 6152 6153 try: 6154 6155 # SpliceAI assembly transcriptome 6156 spliceai_assembly = os.path.join( 6157 config.get("folders", {}) 6158 .get("databases", {}) 6159 .get("spliceai", {}), 6160 options.get("genome"), 6161 "transcriptome", 6162 ) 6163 spip_assembly = options.get("genome") 6164 6165 spip = find( 6166 f"transcriptome_{spip_assembly}.RData", 6167 config.get("folders", {}).get("databases", {}).get("spip", {}), 6168 ) 6169 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6170 log.debug(f"SPiP annotations: {spip}") 6171 log.debug(f"SpliceAI annotations: {spliceai}") 6172 if spip and spliceai: 6173 return [ 6174 f"--spip_transcriptome {spip}", 6175 f"--spliceai_annotations {spliceai}", 6176 ] 6177 else: 6178 # TODO crash and go on with basic annotations ? 
6179 # raise ValueError( 6180 # "Can't find splice databases in configuration EXIT" 6181 # ) 6182 log.warning( 6183 "Can't find splice databases in configuration, use annotations file from image" 6184 ) 6185 except TypeError: 6186 log.warning( 6187 "Can't find splice databases in configuration, use annotations file from image" 6188 ) 6189 return [] 6190 6191 # Add options, check if transcriptome option have already beend provided 6192 if ( 6193 "spip_transcriptome" not in nf_params 6194 and "spliceai_transcriptome" not in nf_params 6195 ): 6196 splice_reference = splice_annotations(options, config) 6197 if splice_reference: 6198 nf_params.extend(splice_reference) 6199 6200 nf_params.append(f"--output_folder {output_folder}") 6201 6202 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6203 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6204 log.debug(cmd) 6205 6206 splice_config["docker"]["command"] = cmd 6207 6208 docker_cmd = get_bin_command( 6209 tool="splice", 6210 bin_type="docker", 6211 config=config, 6212 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6213 add_options=f"--name {random_uuid} {' '.join(mount)}", 6214 ) 6215 6216 # Docker debug 6217 # if splice_config.get("rm_container"): 6218 # rm_container = "--rm" 6219 # else: 6220 # rm_container = "" 6221 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6222 6223 log.debug(docker_cmd) 6224 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6225 log.debug(res.stdout) 6226 if res.stderr: 6227 log.error(res.stderr) 6228 res.check_returncode() 6229 else: 6230 log.warning(f"Splice tool configuration not found: {config}") 6231 
6232 # Update variants 6233 log.info("Annotation - Updating...") 6234 # Test find output vcf 6235 log.debug( 6236 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6237 ) 6238 output_vcf = [] 6239 # Wrong folder to look in 6240 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6241 if ( 6242 files 6243 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6244 ): 6245 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6246 # log.debug(os.listdir(options.get("output_folder"))) 6247 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6248 if not output_vcf: 6249 log.debug( 6250 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6251 ) 6252 else: 6253 # Get new header from annotated vcf 6254 log.debug(f"Initial header: {len(header.infos)} fields") 6255 # Create new header with splice infos 6256 new_vcf = Variants(input=output_vcf[0]) 6257 new_vcf_header = new_vcf.get_header().infos 6258 for keys, infos in new_vcf_header.items(): 6259 if keys not in header.infos.keys(): 6260 header.infos[keys] = infos 6261 log.debug(f"New header: {len(header.infos)} fields") 6262 log.debug(f"Splice tmp output: {output_vcf[0]}") 6263 self.update_from_vcf(output_vcf[0]) 6264 6265 # Remove folder 6266 remove_if_exists(output_folder) 6267 6268 ### 6269 # Prioritization 6270 ### 6271 6272 def get_config_default(self, name: str) -> dict: 6273 """ 6274 The function `get_config_default` returns a dictionary containing default configurations for 6275 various calculations and prioritizations. 6276 6277 :param name: The `get_config_default` function returns a dictionary containing default 6278 configurations for different calculations and prioritizations. 
The `name` parameter is used to 6279 specify which specific configuration to retrieve from the dictionary 6280 :type name: str 6281 :return: The function `get_config_default` returns a dictionary containing default configuration 6282 settings for different calculations and prioritizations. The specific configuration settings are 6283 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6284 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6285 returned. If there is no match, an empty dictionary is returned. 6286 """ 6287 6288 config_default = { 6289 "calculations": { 6290 "variant_chr_pos_alt_ref": { 6291 "type": "sql", 6292 "name": "variant_chr_pos_alt_ref", 6293 "description": "Create a variant ID with chromosome, position, alt and ref", 6294 "available": False, 6295 "output_column_name": "variant_chr_pos_alt_ref", 6296 "output_column_type": "String", 6297 "output_column_description": "variant ID with chromosome, position, alt and ref", 6298 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6299 "operation_info": True, 6300 }, 6301 "VARTYPE": { 6302 "type": "sql", 6303 "name": "VARTYPE", 6304 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6305 "available": True, 6306 "output_column_name": "VARTYPE", 6307 "output_column_type": "String", 6308 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6309 "operation_query": """ 6310 CASE 6311 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6312 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6313 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6314 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6315 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6316 ELSE 'UNDEFINED' 6317 END 6318 """, 6319 "info_fields": ["SVTYPE"], 6320 "operation_info": True, 6321 }, 6322 "snpeff_hgvs": { 6323 "type": "python", 6324 "name": "snpeff_hgvs", 6325 "description": "HGVS nomenclatures from snpEff annotation", 6326 "available": True, 6327 "function_name": "calculation_extract_snpeff_hgvs", 6328 "function_params": ["snpeff_hgvs", "ANN"], 6329 }, 6330 "snpeff_ann_explode": { 6331 "type": "python", 6332 "name": "snpeff_ann_explode", 6333 "description": "Explode snpEff annotations with uniquify values", 6334 "available": True, 6335 "function_name": "calculation_snpeff_ann_explode", 6336 "function_params": [False, "fields", "snpeff_", "ANN"], 6337 }, 6338 "snpeff_ann_explode_uniquify": { 6339 "type": "python", 6340 "name": "snpeff_ann_explode_uniquify", 6341 "description": "Explode snpEff annotations", 6342 "available": True, 6343 "function_name": "calculation_snpeff_ann_explode", 6344 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6345 }, 6346 "snpeff_ann_explode_json": { 6347 "type": "python", 6348 "name": "snpeff_ann_explode_json", 6349 "description": "Explode snpEff annotations in JSON format", 6350 "available": True, 6351 "function_name": "calculation_snpeff_ann_explode", 6352 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6353 }, 6354 "NOMEN": { 6355 "type": "python", 6356 "name": "NOMEN", 6357 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6358 "available": True, 6359 "function_name": "calculation_extract_nomen", 6360 "function_params": [], 6361 }, 6362 "FINDBYPIPELINE": { 6363 "type": "python", 6364 "name": "FINDBYPIPELINE", 6365 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6366 "available": True, 6367 "function_name": "calculation_find_by_pipeline", 6368 "function_params": ["findbypipeline"], 6369 }, 6370 "FINDBYSAMPLE": { 6371 "type": "python", 6372 "name": "FINDBYSAMPLE", 6373 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6374 "available": True, 6375 "function_name": "calculation_find_by_pipeline", 6376 "function_params": ["findbysample"], 6377 }, 6378 "GENOTYPECONCORDANCE": { 6379 "type": "python", 6380 "name": "GENOTYPECONCORDANCE", 6381 "description": "Concordance of genotype for multi caller VCF", 6382 "available": True, 6383 "function_name": "calculation_genotype_concordance", 6384 "function_params": [], 6385 }, 6386 "BARCODE": { 6387 "type": "python", 6388 "name": "BARCODE", 6389 "description": "BARCODE as VaRank tool", 6390 "available": True, 6391 "function_name": "calculation_barcode", 6392 "function_params": [], 6393 }, 6394 "BARCODEFAMILY": { 6395 "type": "python", 6396 "name": "BARCODEFAMILY", 6397 "description": "BARCODEFAMILY as VaRank tool", 6398 "available": True, 6399 "function_name": "calculation_barcode_family", 6400 "function_params": ["BCF"], 6401 }, 6402 "TRIO": { 6403 "type": "python", 6404 "name": "TRIO", 6405 "description": "Inheritance for a trio family", 6406 "available": True, 6407 "function_name": "calculation_trio", 6408 "function_params": [], 6409 }, 6410 "VAF": { 6411 "type": "python", 6412 "name": "VAF", 6413 "description": "Variant Allele Frequency (VAF) harmonization", 6414 "available": True, 6415 "function_name": "calculation_vaf_normalization", 6416 "function_params": [], 6417 }, 6418 "VAF_stats": { 6419 "type": "python", 6420 "name": 
"VAF_stats", 6421 "description": "Variant Allele Frequency (VAF) statistics", 6422 "available": True, 6423 "function_name": "calculation_genotype_stats", 6424 "function_params": ["VAF"], 6425 }, 6426 "DP_stats": { 6427 "type": "python", 6428 "name": "DP_stats", 6429 "description": "Depth (DP) statistics", 6430 "available": True, 6431 "function_name": "calculation_genotype_stats", 6432 "function_params": ["DP"], 6433 }, 6434 "variant_id": { 6435 "type": "python", 6436 "name": "variant_id", 6437 "description": "Variant ID generated from variant position and type", 6438 "available": True, 6439 "function_name": "calculation_variant_id", 6440 "function_params": [], 6441 }, 6442 }, 6443 "prioritizations": { 6444 "default": { 6445 "filter": [ 6446 { 6447 "type": "notequals", 6448 "value": "!PASS|\\.", 6449 "score": 0, 6450 "flag": "FILTERED", 6451 "comment": ["Bad variant quality"], 6452 }, 6453 { 6454 "type": "equals", 6455 "value": "REJECT", 6456 "score": -20, 6457 "flag": "PASS", 6458 "comment": ["Bad variant quality"], 6459 }, 6460 ], 6461 "DP": [ 6462 { 6463 "type": "gte", 6464 "value": "50", 6465 "score": 5, 6466 "flag": "PASS", 6467 "comment": ["DP higher than 50"], 6468 } 6469 ], 6470 "ANN": [ 6471 { 6472 "type": "contains", 6473 "value": "HIGH", 6474 "score": 5, 6475 "flag": "PASS", 6476 "comment": [ 6477 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6478 ], 6479 }, 6480 { 6481 "type": "contains", 6482 "value": "MODERATE", 6483 "score": 3, 6484 "flag": "PASS", 6485 "comment": [ 6486 "A non-disruptive variant that might change protein effectiveness" 6487 ], 6488 }, 6489 { 6490 "type": "contains", 6491 "value": "LOW", 6492 "score": 0, 6493 "flag": "FILTERED", 6494 "comment": [ 6495 "Assumed to be mostly harmless or unlikely to change protein behavior" 6496 ], 6497 }, 6498 { 6499 "type": "contains", 6500 "value": "MODIFIER", 6501 "score": 0, 6502 
"flag": "FILTERED", 6503 "comment": [ 6504 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6505 ], 6506 }, 6507 ], 6508 } 6509 }, 6510 } 6511 6512 return config_default.get(name, None) 6513 6514 def get_config_json( 6515 self, name: str, config_dict: dict = {}, config_file: str = None 6516 ) -> dict: 6517 """ 6518 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6519 default values, a dictionary, and a file. 6520 6521 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6522 the name of the configuration. It is used to identify and retrieve the configuration settings 6523 for a specific component or module 6524 :type name: str 6525 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6526 dictionary that allows you to provide additional configuration settings or overrides. When you 6527 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6528 the key is the configuration setting you want to override or 6529 :type config_dict: dict 6530 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6531 specify the path to a configuration file that contains additional settings. If provided, the 6532 function will read the contents of this file and update the configuration dictionary with the 6533 values found in the file, overriding any existing values with the 6534 :type config_file: str 6535 :return: The function `get_config_json` returns a dictionary containing the configuration 6536 settings. 
6537 """ 6538 6539 # Create with default prioritizations 6540 config_default = self.get_config_default(name=name) 6541 configuration = config_default 6542 # log.debug(f"configuration={configuration}") 6543 6544 # Replace prioritizations from dict 6545 for config in config_dict: 6546 configuration[config] = config_dict[config] 6547 6548 # Replace prioritizations from file 6549 config_file = full_path(config_file) 6550 if config_file: 6551 if os.path.exists(config_file): 6552 with open(config_file) as config_file_content: 6553 config_file_dict = json.load(config_file_content) 6554 for config in config_file_dict: 6555 configuration[config] = config_file_dict[config] 6556 else: 6557 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6558 log.error(msg_error) 6559 raise ValueError(msg_error) 6560 6561 return configuration 6562 6563 def prioritization(self) -> None: 6564 """ 6565 It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other 6566 INFO fields 6567 """ 6568 6569 # Config 6570 config = self.get_config() 6571 6572 # Param 6573 param = self.get_param() 6574 6575 # Quick Prioritizations 6576 # prioritizations = param.get("prioritization", {}).get("prioritizations", "") 6577 6578 # Configuration profiles 6579 prioritization_config_file = param.get("prioritization", {}).get( 6580 "prioritization_config", None 6581 ) 6582 prioritization_config_file = full_path(prioritization_config_file) 6583 prioritizations_config = self.get_config_json( 6584 name="prioritizations", config_file=prioritization_config_file 6585 ) 6586 6587 # Prioritization options 6588 profiles = param.get("prioritization", {}).get("profiles", []) 6589 if isinstance(profiles, str): 6590 profiles = profiles.split(",") 6591 pzfields = param.get("prioritization", {}).get( 6592 "pzfields", ["PZFlag", "PZScore"] 6593 ) 6594 if isinstance(pzfields, str): 6595 pzfields = pzfields.split(",") 6596 default_profile = param.get("prioritization", 
{}).get("default_profile", None) 6597 pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_") 6598 prioritization_score_mode = param.get("prioritization", {}).get( 6599 "prioritization_score_mode", "HOWARD" 6600 ) 6601 6602 # Quick Prioritizations 6603 # prioritizations = param.get("prioritization", {}).get("prioritizations", None) 6604 prioritizations = param.get("prioritizations", None) 6605 if prioritizations: 6606 log.info("Quick Prioritization:") 6607 for profile in prioritizations.split(","): 6608 if profile not in profiles: 6609 profiles.append(profile) 6610 log.info(f" {profile}") 6611 6612 # If profile "ALL" provided, all profiles in the config profiles 6613 if "ALL" in profiles: 6614 profiles = list(prioritizations_config.keys()) 6615 6616 for profile in profiles: 6617 if prioritizations_config.get(profile, None): 6618 log.debug(f"Profile '{profile}' configured") 6619 else: 6620 msg_error = f"Profile '{profile}' NOT configured" 6621 log.error(msg_error) 6622 raise ValueError(msg_error) 6623 6624 if profiles: 6625 log.info(f"Prioritization... 
") 6626 else: 6627 log.debug(f"No profile defined") 6628 return 6629 6630 if not default_profile and len(profiles): 6631 default_profile = profiles[0] 6632 6633 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 6634 log.debug("Profiles to check: " + str(list(profiles))) 6635 6636 # Variables 6637 table_variants = self.get_table_variants(clause="update") 6638 6639 # Added columns 6640 added_columns = [] 6641 6642 # Create list of PZfields 6643 # List of PZFields 6644 list_of_pzfields_original = pzfields + [ 6645 pzfield + pzfields_sep + profile 6646 for pzfield in pzfields 6647 for profile in profiles 6648 ] 6649 list_of_pzfields = [] 6650 log.debug(f"{list_of_pzfields_original}") 6651 6652 # Remove existing PZfields to use if exists 6653 for pzfield in list_of_pzfields_original: 6654 if self.get_header().infos.get(pzfield, None) is None: 6655 list_of_pzfields.append(pzfield) 6656 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 6657 else: 6658 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 6659 6660 if list_of_pzfields: 6661 6662 # Explode Infos fields 6663 explode_infos_prefix = self.get_explode_infos_prefix() 6664 added_columns += self.explode_infos(prefix=explode_infos_prefix) 6665 extra_infos = self.get_extra_infos() 6666 6667 # PZfields tags description 6668 PZfields_INFOS = { 6669 "PZTags": { 6670 "ID": "PZTags", 6671 "Number": ".", 6672 "Type": "String", 6673 "Description": "Variant tags based on annotation criteria", 6674 }, 6675 "PZScore": { 6676 "ID": "PZScore", 6677 "Number": 1, 6678 "Type": "Integer", 6679 "Description": "Variant score based on annotation criteria", 6680 }, 6681 "PZFlag": { 6682 "ID": "PZFlag", 6683 "Number": 1, 6684 "Type": "String", 6685 "Description": "Variant flag based on annotation criteria", 6686 }, 6687 "PZComment": { 6688 "ID": "PZComment", 6689 "Number": ".", 6690 "Type": "String", 6691 "Description": "Variant comment based on annotation criteria", 6692 }, 
6693 "PZInfos": { 6694 "ID": "PZInfos", 6695 "Number": ".", 6696 "Type": "String", 6697 "Description": "Variant infos based on annotation criteria", 6698 }, 6699 } 6700 6701 # Create INFO fields if not exist 6702 for field in PZfields_INFOS: 6703 field_ID = PZfields_INFOS[field]["ID"] 6704 field_description = PZfields_INFOS[field]["Description"] 6705 if field_ID not in self.get_header().infos and field_ID in pzfields: 6706 field_description = ( 6707 PZfields_INFOS[field]["Description"] 6708 + f", profile {default_profile}" 6709 ) 6710 self.get_header().infos[field_ID] = vcf.parser._Info( 6711 field_ID, 6712 PZfields_INFOS[field]["Number"], 6713 PZfields_INFOS[field]["Type"], 6714 field_description, 6715 "unknown", 6716 "unknown", 6717 code_type_map[PZfields_INFOS[field]["Type"]], 6718 ) 6719 6720 # Create INFO fields if not exist for each profile 6721 for profile in prioritizations_config: 6722 if profile in profiles or profiles == []: 6723 for field in PZfields_INFOS: 6724 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 6725 field_description = ( 6726 PZfields_INFOS[field]["Description"] 6727 + f", profile {profile}" 6728 ) 6729 if ( 6730 field_ID not in self.get_header().infos 6731 and field in pzfields 6732 ): 6733 self.get_header().infos[field_ID] = vcf.parser._Info( 6734 field_ID, 6735 PZfields_INFOS[field]["Number"], 6736 PZfields_INFOS[field]["Type"], 6737 field_description, 6738 "unknown", 6739 "unknown", 6740 code_type_map[PZfields_INFOS[field]["Type"]], 6741 ) 6742 6743 # Header 6744 for pzfield in list_of_pzfields: 6745 if re.match("PZScore.*", pzfield): 6746 added_column = self.add_column( 6747 table_name=table_variants, 6748 column_name=pzfield, 6749 column_type="INTEGER", 6750 default_value="0", 6751 ) 6752 elif re.match("PZFlag.*", pzfield): 6753 added_column = self.add_column( 6754 table_name=table_variants, 6755 column_name=pzfield, 6756 column_type="BOOLEAN", 6757 default_value="1", 6758 ) 6759 else: 6760 added_column = 
self.add_column( 6761 table_name=table_variants, 6762 column_name=pzfield, 6763 column_type="STRING", 6764 default_value="''", 6765 ) 6766 added_columns.append(added_column) 6767 6768 # Profiles 6769 if profiles: 6770 6771 # foreach profile in configuration file 6772 for profile in prioritizations_config: 6773 6774 # If profile is asked in param, or ALL are asked (empty profile []) 6775 if profile in profiles or profiles == []: 6776 log.info(f"Profile '{profile}'") 6777 6778 sql_set_info_option = "" 6779 6780 sql_set_info = [] 6781 6782 # PZ fields set 6783 6784 # PZScore 6785 if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields: 6786 sql_set_info.append( 6787 f""" 6788 concat( 6789 'PZScore{pzfields_sep}{profile}=', 6790 PZScore{pzfields_sep}{profile} 6791 ) 6792 """ 6793 ) 6794 if ( 6795 profile == default_profile 6796 and "PZScore" in list_of_pzfields 6797 ): 6798 sql_set_info.append( 6799 f""" 6800 concat( 6801 'PZScore=', 6802 PZScore{pzfields_sep}{profile} 6803 ) 6804 """ 6805 ) 6806 6807 # PZFlag 6808 if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields: 6809 sql_set_info.append( 6810 f""" 6811 concat( 6812 'PZFlag{pzfields_sep}{profile}=', 6813 CASE 6814 WHEN PZFlag{pzfields_sep}{profile}==1 6815 THEN 'PASS' 6816 WHEN PZFlag{pzfields_sep}{profile}==0 6817 THEN 'FILTERED' 6818 END 6819 ) 6820 """ 6821 ) 6822 if ( 6823 profile == default_profile 6824 and "PZFlag" in list_of_pzfields 6825 ): 6826 sql_set_info.append( 6827 f""" 6828 concat( 6829 'PZFlag=', 6830 CASE 6831 WHEN PZFlag{pzfields_sep}{profile}==1 6832 THEN 'PASS' 6833 WHEN PZFlag{pzfields_sep}{profile}==0 6834 THEN 'FILTERED' 6835 END 6836 ) 6837 """ 6838 ) 6839 6840 # PZComment 6841 if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields: 6842 sql_set_info.append( 6843 f""" 6844 CASE 6845 WHEN PZComment{pzfields_sep}{profile} NOT IN ('') 6846 THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile}) 6847 ELSE '' 6848 END 6849 """ 6850 ) 6851 if ( 6852 profile == 
default_profile 6853 and "PZComment" in list_of_pzfields 6854 ): 6855 sql_set_info.append( 6856 f""" 6857 CASE 6858 WHEN PZComment{pzfields_sep}{profile} NOT IN ('') 6859 THEN concat('PZComment=', PZComment{pzfields_sep}{profile}) 6860 ELSE '' 6861 END 6862 """ 6863 ) 6864 6865 # PZInfos 6866 if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields: 6867 sql_set_info.append( 6868 f""" 6869 CASE 6870 WHEN PZInfos{pzfields_sep}{profile} NOT IN ('') 6871 THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile}) 6872 ELSE '' 6873 END 6874 """ 6875 ) 6876 if ( 6877 profile == default_profile 6878 and "PZInfos" in list_of_pzfields 6879 ): 6880 sql_set_info.append( 6881 f""" 6882 CASE 6883 WHEN PZInfos{pzfields_sep}{profile} NOT IN ('') 6884 THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile}) 6885 ELSE '' 6886 END 6887 """ 6888 ) 6889 6890 # Merge PZfields 6891 sql_set_info_option = "" 6892 sql_set_sep = "" 6893 for sql_set in sql_set_info: 6894 if sql_set_sep: 6895 sql_set_info_option += f""" 6896 , concat('{sql_set_sep}', {sql_set}) 6897 """ 6898 else: 6899 sql_set_info_option += f""" 6900 , {sql_set} 6901 """ 6902 sql_set_sep = ";" 6903 6904 sql_queries = [] 6905 for annotation in prioritizations_config[profile]: 6906 6907 # Check if annotation field is present 6908 if not f"{explode_infos_prefix}{annotation}" in extra_infos: 6909 log.debug(f"Annotation '{annotation}' not in data") 6910 continue 6911 else: 6912 log.debug(f"Annotation '{annotation}' in data") 6913 6914 # For each criterions 6915 for criterion in prioritizations_config[profile][ 6916 annotation 6917 ]: 6918 criterion_type = criterion["type"] 6919 criterion_value = criterion["value"] 6920 criterion_score = criterion.get("score", 0) 6921 criterion_flag = criterion.get("flag", "PASS") 6922 criterion_flag_bool = criterion_flag == "PASS" 6923 criterion_comment = ( 6924 ", ".join(criterion.get("comment", [])) 6925 .replace("'", "''") 6926 .replace(";", ",") 6927 .replace("\t", " ") 
6928 ) 6929 criterion_infos = ( 6930 str(criterion) 6931 .replace("'", "''") 6932 .replace(";", ",") 6933 .replace("\t", " ") 6934 ) 6935 6936 sql_set = [] 6937 sql_set_info = [] 6938 6939 # PZ fields set 6940 if ( 6941 f"PZScore{pzfields_sep}{profile}" 6942 in list_of_pzfields 6943 ): 6944 if prioritization_score_mode == "HOWARD": 6945 sql_set.append( 6946 f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}" 6947 ) 6948 elif prioritization_score_mode == "VaRank": 6949 sql_set.append( 6950 f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END" 6951 ) 6952 else: 6953 sql_set.append( 6954 f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}" 6955 ) 6956 if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields: 6957 sql_set.append( 6958 f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}" 6959 ) 6960 if ( 6961 f"PZComment{pzfields_sep}{profile}" 6962 in list_of_pzfields 6963 ): 6964 sql_set.append( 6965 f""" 6966 PZComment{pzfields_sep}{profile} = 6967 concat( 6968 PZComment{pzfields_sep}{profile}, 6969 CASE 6970 WHEN PZComment{pzfields_sep}{profile}!='' 6971 THEN ', ' 6972 ELSE '' 6973 END, 6974 '{criterion_comment}' 6975 ) 6976 """ 6977 ) 6978 if ( 6979 f"PZInfos{pzfields_sep}{profile}" 6980 in list_of_pzfields 6981 ): 6982 sql_set.append( 6983 f""" 6984 PZInfos{pzfields_sep}{profile} = 6985 concat( 6986 PZInfos{pzfields_sep}{profile}, 6987 '{criterion_infos}' 6988 ) 6989 """ 6990 ) 6991 sql_set_option = ",".join(sql_set) 6992 6993 # Criterion and comparison 6994 try: 6995 float(criterion_value) 6996 sql_update = f""" 6997 UPDATE {table_variants} 6998 SET {sql_set_option} 6999 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7000 AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value} 7001 """ 7002 except: 7003 contains_option = "" 7004 
if criterion_type == "contains": 7005 contains_option = ".*" 7006 sql_update = f""" 7007 UPDATE {table_variants} 7008 SET {sql_set_option} 7009 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7010 """ 7011 sql_queries.append(sql_update) 7012 7013 # PZTags 7014 if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields: 7015 7016 # Create PZFalgs value 7017 pztags_value = "" 7018 pztags_sep_default = "|" 7019 pztags_sep = "" 7020 for pzfield in pzfields: 7021 if pzfield not in ["PZTags"]: 7022 if ( 7023 f"{pzfield}{pzfields_sep}{profile}" 7024 in list_of_pzfields 7025 ): 7026 if pzfield in ["PZFlag"]: 7027 pztags_value += f"""{pztags_sep}{pzfield}#', 7028 CASE WHEN PZFlag{pzfields_sep}{profile} 7029 THEN 'PASS' 7030 ELSE 'FILTERED' 7031 END, '""" 7032 else: 7033 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7034 pztags_sep = pztags_sep_default 7035 7036 # Add Query update for PZFlags 7037 sql_update_pztags = f""" 7038 UPDATE {table_variants} 7039 SET INFO = concat( 7040 INFO, 7041 CASE WHEN INFO NOT in ('','.') 7042 THEN ';' 7043 ELSE '' 7044 END, 7045 'PZTags{pzfields_sep}{profile}={pztags_value}' 7046 ) 7047 """ 7048 sql_queries.append(sql_update_pztags) 7049 7050 # Add Query update for PZFlags for default 7051 if profile == default_profile: 7052 sql_update_pztags_default = f""" 7053 UPDATE {table_variants} 7054 SET INFO = concat( 7055 INFO, 7056 ';', 7057 'PZTags={pztags_value}' 7058 ) 7059 """ 7060 sql_queries.append(sql_update_pztags_default) 7061 7062 log.info(f"""Profile '{profile}' - Prioritization... """) 7063 7064 if sql_queries: 7065 7066 for sql_query in sql_queries: 7067 log.debug( 7068 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7069 ) 7070 self.conn.execute(sql_query) 7071 7072 log.info(f"""Profile '{profile}' - Update... 
""") 7073 sql_query_update = f""" 7074 UPDATE {table_variants} 7075 SET INFO = 7076 concat( 7077 CASE 7078 WHEN INFO NOT IN ('','.') 7079 THEN concat(INFO, ';') 7080 ELSE '' 7081 END 7082 {sql_set_info_option} 7083 ) 7084 """ 7085 self.conn.execute(sql_query_update) 7086 7087 else: 7088 7089 log.warning(f"No profiles in parameters") 7090 7091 # Remove added columns 7092 for added_column in added_columns: 7093 self.drop_column(column=added_column) 7094 7095 # Explode INFOS fields into table fields 7096 if self.get_explode_infos(): 7097 self.explode_infos( 7098 prefix=self.get_explode_infos_prefix(), 7099 fields=self.get_explode_infos_fields(), 7100 force=True, 7101 ) 7102 7103 return 7104 7105 ### 7106 # HGVS 7107 ### 7108 7109 def annotation_hgvs(self, threads: int = None) -> None: 7110 """ 7111 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7112 coordinates and alleles. 7113 7114 :param threads: The `threads` parameter is an optional integer that specifies the number of 7115 threads to use for parallel processing. If no value is provided, it will default to the number 7116 of threads obtained from the `get_threads()` method 7117 :type threads: int 7118 """ 7119 7120 # Function for each partition of the Dask Dataframe 7121 def partition_function(partition): 7122 """ 7123 The function `partition_function` applies the `annotation_hgvs_partition` function to 7124 each row of a DataFrame called `partition`. 7125 7126 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7127 to be processed 7128 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7129 the "partition" dataframe along the axis 1. 
7130 """ 7131 return partition.apply(annotation_hgvs_partition, axis=1) 7132 7133 def annotation_hgvs_partition(row) -> str: 7134 """ 7135 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7136 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7137 7138 :param row: A dictionary-like object that contains the values for the following keys: 7139 :return: a string that contains the HGVS names associated with the given row of data. 7140 """ 7141 7142 chr = row["CHROM"] 7143 pos = row["POS"] 7144 ref = row["REF"] 7145 alt = row["ALT"] 7146 7147 # Find list of associated transcripts 7148 transcripts_list = list( 7149 polars_conn.execute( 7150 f""" 7151 SELECT transcript 7152 FROM refseq_df 7153 WHERE CHROM='{chr}' 7154 AND POS={pos} 7155 """ 7156 )["transcript"] 7157 ) 7158 7159 # Full HGVS annotation in list 7160 hgvs_full_list = [] 7161 7162 for transcript_name in transcripts_list: 7163 7164 # Transcript 7165 transcript = get_transcript( 7166 transcripts=transcripts, transcript_name=transcript_name 7167 ) 7168 # Exon 7169 if use_exon: 7170 exon = transcript.find_exon_number(pos) 7171 else: 7172 exon = None 7173 # Protein 7174 transcript_protein = None 7175 if use_protein or add_protein or full_format: 7176 transcripts_protein = list( 7177 polars_conn.execute( 7178 f""" 7179 SELECT protein 7180 FROM refseqlink_df 7181 WHERE transcript='{transcript_name}' 7182 LIMIT 1 7183 """ 7184 )["protein"] 7185 ) 7186 if len(transcripts_protein): 7187 transcript_protein = transcripts_protein[0] 7188 7189 # HGVS name 7190 hgvs_name = format_hgvs_name( 7191 chr, 7192 pos, 7193 ref, 7194 alt, 7195 genome=genome, 7196 transcript=transcript, 7197 transcript_protein=transcript_protein, 7198 exon=exon, 7199 use_gene=use_gene, 7200 use_protein=use_protein, 7201 full_format=full_format, 7202 use_version=use_version, 7203 codon_type=codon_type, 7204 ) 7205 hgvs_full_list.append(hgvs_name) 7206 if add_protein and not 
use_protein and not full_format: 7207 hgvs_name = format_hgvs_name( 7208 chr, 7209 pos, 7210 ref, 7211 alt, 7212 genome=genome, 7213 transcript=transcript, 7214 transcript_protein=transcript_protein, 7215 exon=exon, 7216 use_gene=use_gene, 7217 use_protein=True, 7218 full_format=False, 7219 use_version=use_version, 7220 codon_type=codon_type, 7221 ) 7222 hgvs_full_list.append(hgvs_name) 7223 7224 # Create liste of HGVS annotations 7225 hgvs_full = ",".join(hgvs_full_list) 7226 7227 return hgvs_full 7228 7229 # Polars connexion 7230 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7231 7232 # Config 7233 config = self.get_config() 7234 7235 # Databases 7236 # Genome 7237 databases_genomes_folders = ( 7238 config.get("folders", {}) 7239 .get("databases", {}) 7240 .get("genomes", DEFAULT_GENOME_FOLDER) 7241 ) 7242 databases_genome = ( 7243 config.get("folders", {}).get("databases", {}).get("genomes", "") 7244 ) 7245 # refseq database folder 7246 databases_refseq_folders = ( 7247 config.get("folders", {}) 7248 .get("databases", {}) 7249 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7250 ) 7251 # refseq 7252 databases_refseq = config.get("databases", {}).get("refSeq", None) 7253 # refSeqLink 7254 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7255 7256 # Param 7257 param = self.get_param() 7258 7259 # Quick HGVS 7260 if "hgvs_options" in param and param.get("hgvs_options", ""): 7261 log.info(f"Quick HGVS Annotation:") 7262 if not param.get("hgvs", None): 7263 param["hgvs"] = {} 7264 for option in param.get("hgvs_options", "").split(","): 7265 option_var_val = option.split("=") 7266 option_var = option_var_val[0] 7267 if len(option_var_val) > 1: 7268 option_val = option_var_val[1] 7269 else: 7270 option_val = "True" 7271 if option_val.upper() in ["TRUE"]: 7272 option_val = True 7273 elif option_val.upper() in ["FALSE"]: 7274 option_val = False 7275 log.info(f" {option_var}={option_val}") 7276 param["hgvs"][option_var] = option_val 7277 
7278 # Check if HGVS annotation enabled 7279 if "hgvs" in param: 7280 log.info(f"HGVS Annotation... ") 7281 for hgvs_option in param.get("hgvs", {}): 7282 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7283 else: 7284 return 7285 7286 # HGVS Param 7287 param_hgvs = param.get("hgvs", {}) 7288 use_exon = param_hgvs.get("use_exon", False) 7289 use_gene = param_hgvs.get("use_gene", False) 7290 use_protein = param_hgvs.get("use_protein", False) 7291 add_protein = param_hgvs.get("add_protein", False) 7292 full_format = param_hgvs.get("full_format", False) 7293 use_version = param_hgvs.get("use_version", False) 7294 codon_type = param_hgvs.get("codon_type", "3") 7295 7296 # refSseq refSeqLink 7297 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7298 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7299 7300 # Assembly 7301 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7302 7303 # Genome 7304 genome_file = None 7305 if find_genome(databases_genome): 7306 genome_file = find_genome(databases_genome) 7307 else: 7308 genome_file = find_genome( 7309 genome_path=databases_genomes_folders, assembly=assembly 7310 ) 7311 log.debug("Genome: " + str(genome_file)) 7312 7313 # refSseq 7314 refseq_file = find_file_prefix( 7315 input_file=databases_refseq, 7316 prefix="ncbiRefSeq", 7317 folder=databases_refseq_folders, 7318 assembly=assembly, 7319 ) 7320 log.debug("refSeq: " + str(refseq_file)) 7321 7322 # refSeqLink 7323 refseqlink_file = find_file_prefix( 7324 input_file=databases_refseqlink, 7325 prefix="ncbiRefSeqLink", 7326 folder=databases_refseq_folders, 7327 assembly=assembly, 7328 ) 7329 log.debug("refSeqLink: " + str(refseqlink_file)) 7330 7331 # Threads 7332 if not threads: 7333 threads = self.get_threads() 7334 log.debug("Threads: " + str(threads)) 7335 7336 # Variables 7337 table_variants = self.get_table_variants(clause="update") 7338 7339 # Get variants SNV and InDel only 7340 
query_variants = f""" 7341 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7342 FROM {table_variants} 7343 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7344 """ 7345 df_variants = self.get_query_to_df(query_variants) 7346 7347 # Added columns 7348 added_columns = [] 7349 7350 # Add hgvs column in variants table 7351 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7352 added_column = self.add_column( 7353 table_variants, hgvs_column_name, "STRING", default_value=None 7354 ) 7355 added_columns.append(added_column) 7356 7357 log.debug(f"refSeq loading...") 7358 # refSeq in duckDB 7359 refseq_table = get_refseq_table( 7360 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7361 ) 7362 # Loading all refSeq in Dataframe 7363 refseq_query = f""" 7364 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 7365 FROM {refseq_table} 7366 JOIN df_variants ON ( 7367 {refseq_table}.chrom = df_variants.CHROM 7368 AND {refseq_table}.txStart<=df_variants.POS 7369 AND {refseq_table}.txEnd>=df_variants.POS 7370 ) 7371 """ 7372 refseq_df = self.conn.query(refseq_query).pl() 7373 7374 if refseqlink_file: 7375 log.debug(f"refSeqLink loading...") 7376 # refSeqLink in duckDB 7377 refseqlink_table = get_refseq_table( 7378 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 7379 ) 7380 # Loading all refSeqLink in Dataframe 7381 protacc_column = "protAcc_with_ver" 7382 mrnaacc_column = "mrnaAcc_with_ver" 7383 refseqlink_query = f""" 7384 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 7385 FROM {refseqlink_table} 7386 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 7387 WHERE protAcc_without_ver IS NOT NULL 7388 """ 7389 # Polars Dataframe 7390 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 7391 7392 # Read RefSeq transcripts into a python dict/model. 
7393 log.debug(f"Transcripts loading...") 7394 with tempfile.TemporaryDirectory() as tmpdir: 7395 transcripts_query = f""" 7396 COPY ( 7397 SELECT {refseq_table}.* 7398 FROM {refseq_table} 7399 JOIN df_variants ON ( 7400 {refseq_table}.chrom=df_variants.CHROM 7401 AND {refseq_table}.txStart<=df_variants.POS 7402 AND {refseq_table}.txEnd>=df_variants.POS 7403 ) 7404 ) 7405 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 7406 """ 7407 self.conn.query(transcripts_query) 7408 with open(f"{tmpdir}/transcript.tsv") as infile: 7409 transcripts = read_transcripts(infile) 7410 7411 # Polars connexion 7412 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7413 7414 log.debug("Genome loading...") 7415 # Read genome sequence using pyfaidx. 7416 genome = Fasta(genome_file) 7417 7418 log.debug("Start annotation HGVS...") 7419 7420 # Create 7421 # a Dask Dataframe from Pandas dataframe with partition as number of threads 7422 ddf = dd.from_pandas(df_variants, npartitions=threads) 7423 7424 # Use dask.dataframe.apply() to apply function on each partition 7425 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 7426 7427 # Convert Dask DataFrame to Pandas Dataframe 7428 df = ddf.compute() 7429 7430 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
7431 with tempfile.TemporaryDirectory() as tmpdir: 7432 df_parquet = os.path.join(tmpdir, "df.parquet") 7433 df.to_parquet(df_parquet) 7434 7435 # Update hgvs column 7436 update_variant_query = f""" 7437 UPDATE {table_variants} 7438 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 7439 FROM read_parquet('{df_parquet}') as df 7440 WHERE variants."#CHROM" = df.CHROM 7441 AND variants.POS = df.POS 7442 AND variants.REF = df.REF 7443 AND variants.ALT = df.ALT 7444 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 7445 """ 7446 self.execute_query(update_variant_query) 7447 7448 # Update INFO column 7449 sql_query_update = f""" 7450 UPDATE {table_variants} 7451 SET INFO = 7452 concat( 7453 CASE 7454 WHEN INFO NOT IN ('','.') 7455 THEN concat(INFO, ';') 7456 ELSE '' 7457 END, 7458 'hgvs=', 7459 {hgvs_column_name} 7460 ) 7461 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 7462 """ 7463 self.execute_query(sql_query_update) 7464 7465 # Add header 7466 HGVS_INFOS = { 7467 "hgvs": { 7468 "ID": "hgvs", 7469 "Number": ".", 7470 "Type": "String", 7471 "Description": f"HGVS annotatation with HOWARD", 7472 } 7473 } 7474 7475 for field in HGVS_INFOS: 7476 field_ID = HGVS_INFOS[field]["ID"] 7477 field_description = HGVS_INFOS[field]["Description"] 7478 self.get_header().infos[field_ID] = vcf.parser._Info( 7479 field_ID, 7480 HGVS_INFOS[field]["Number"], 7481 HGVS_INFOS[field]["Type"], 7482 field_description, 7483 "unknown", 7484 "unknown", 7485 code_type_map[HGVS_INFOS[field]["Type"]], 7486 ) 7487 7488 # Remove added columns 7489 for added_column in added_columns: 7490 self.drop_column(column=added_column) 7491 7492 ### 7493 # Calculation 7494 ### 7495 7496 def get_operations_help( 7497 self, operations_config_dict: dict = {}, operations_config_file: str = None 7498 ) -> list: 7499 7500 # Init 7501 operations_help = [] 7502 7503 # operations 7504 operations = self.get_config_json( 7505 name="calculations", 7506 
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )
        # Keep only operations flagged as available; show "NAME: description"
        for op in operations:
            op_name = operations[op].get("name", op).upper()
            op_description = operations[op].get("description", op_name)
            op_available = operations[op].get("available", False)
            if op_available:
                operations_help.append(f"   {op_name}: {op_description}")

        # Sort operations
        operations_help.sort()

        # insert header
        operations_help.insert(0, "Available calculation operations:")

        # Return
        return operations_help

    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                "middle" : null
            }
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation names are matched case-insensitively)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add: comma-separated list from param "calculations"
        if param.get("calculations", None):
            calculations_list = [
                value for value in param.get("calculations", "").split(",")
            ]
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f"   {calculation_key}")
            for calculation_operation in calculations_list:
                # Register each quick operation (empty options) if not already present
                if calculation_operation.upper() not in operations:
                    operations[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=param,
                        sections=[
                            "calculation",
                            "calculations",
                            calculation_operation.upper(),
                        ],
                        value={},
                    )

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        # Dispatch on operation type: python function or SQL query
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

    def calculation_process_sql(
        self, operation: dict, operation_name: str = "unknown"
    ) -> None:
        """
        The `calculation_process_sql` function takes in a mathematical operation as a string and
        performs the operation, updating the specified table with the result.
7636 7637 :param operation: The `operation` parameter is a dictionary that contains information about the 7638 mathematical operation to be performed. It includes the following keys: 7639 :type operation: dict 7640 :param operation_name: The `operation_name` parameter is a string that represents the name of 7641 the mathematical operation being performed. It is used for logging and error handling purposes, 7642 defaults to unknown 7643 :type operation_name: str (optional) 7644 """ 7645 7646 # table variants 7647 table_variants = self.get_table_variants(clause="alter") 7648 7649 # Operation infos 7650 operation_name = operation.get("name", "unknown") 7651 log.debug(f"process sql {operation_name}") 7652 output_column_name = operation.get("output_column_name", operation_name) 7653 output_column_type = operation.get("output_column_type", "String") 7654 prefix = operation.get("explode_infos_prefix", "") 7655 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 7656 output_column_description = operation.get( 7657 "output_column_description", f"{operation_name} operation" 7658 ) 7659 operation_query = operation.get("operation_query", None) 7660 if isinstance(operation_query, list): 7661 operation_query = " ".join(operation_query) 7662 operation_info_fields = operation.get("info_fields", []) 7663 operation_info_fields_check = operation.get("info_fields_check", False) 7664 operation_info = operation.get("operation_info", True) 7665 7666 if operation_query: 7667 7668 # Info fields check 7669 operation_info_fields_check_result = True 7670 if operation_info_fields_check: 7671 header_infos = self.get_header().infos 7672 for info_field in operation_info_fields: 7673 operation_info_fields_check_result = ( 7674 operation_info_fields_check_result 7675 and info_field in header_infos 7676 ) 7677 7678 # If info fields available 7679 if operation_info_fields_check_result: 7680 7681 # Added_columns 7682 added_columns = [] 7683 7684 # Create VCF header field 
7685 vcf_reader = self.get_header() 7686 vcf_reader.infos[output_column_name] = vcf.parser._Info( 7687 output_column_name, 7688 ".", 7689 output_column_type, 7690 output_column_description, 7691 "howard calculation", 7692 "0", 7693 self.code_type_map.get(output_column_type), 7694 ) 7695 7696 # Explode infos if needed 7697 log.debug(f"calculation_process_sql prefix {prefix}") 7698 added_columns += self.explode_infos( 7699 prefix=prefix, 7700 fields=[output_column_name] + operation_info_fields, 7701 force=True, 7702 ) 7703 7704 # Create column 7705 added_column = self.add_column( 7706 table_name=table_variants, 7707 column_name=prefix + output_column_name, 7708 column_type=output_column_type_sql, 7709 default_value="null", 7710 ) 7711 added_columns.append(added_column) 7712 7713 # Operation calculation 7714 try: 7715 7716 # Query to update calculation column 7717 sql_update = f""" 7718 UPDATE {table_variants} 7719 SET "{prefix}{output_column_name}" = ({operation_query}) 7720 """ 7721 self.conn.execute(sql_update) 7722 7723 # Add to INFO 7724 if operation_info: 7725 sql_update_info = f""" 7726 UPDATE {table_variants} 7727 SET "INFO" = 7728 concat( 7729 CASE 7730 WHEN "INFO" IS NOT NULL 7731 THEN concat("INFO", ';') 7732 ELSE '' 7733 END, 7734 '{output_column_name}=', 7735 "{prefix}{output_column_name}" 7736 ) 7737 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 7738 """ 7739 self.conn.execute(sql_update_info) 7740 7741 except: 7742 log.error( 7743 f"Operations config: Calculation '{operation_name}' query failed" 7744 ) 7745 raise ValueError( 7746 f"Operations config: Calculation '{operation_name}' query failed" 7747 ) 7748 7749 # Remove added columns 7750 for added_column in added_columns: 7751 log.debug(f"added_column: {added_column}") 7752 self.drop_column(column=added_column) 7753 7754 else: 7755 log.error( 7756 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 7757 ) 7758 raise ValueError( 7759 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 7760 ) 7761 7762 else: 7763 log.error( 7764 f"Operations config: Calculation '{operation_name}' query NOT defined" 7765 ) 7766 raise ValueError( 7767 f"Operations config: Calculation '{operation_name}' query NOT defined" 7768 ) 7769 7770 def calculation_process_function( 7771 self, operation: dict, operation_name: str = "unknown" 7772 ) -> None: 7773 """ 7774 The `calculation_process_function` takes in an operation dictionary and performs the specified 7775 function with the given parameters. 7776 7777 :param operation: The `operation` parameter is a dictionary that contains information about the 7778 operation to be performed. It has the following keys: 7779 :type operation: dict 7780 :param operation_name: The `operation_name` parameter is a string that represents the name of 7781 the operation being performed. It is used for logging purposes, defaults to unknown 7782 :type operation_name: str (optional) 7783 """ 7784 7785 operation_name = operation["name"] 7786 log.debug(f"process sql {operation_name}") 7787 function_name = operation["function_name"] 7788 function_params = operation["function_params"] 7789 getattr(self, function_name)(*function_params) 7790 7791 def calculation_variant_id(self) -> None: 7792 """ 7793 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 7794 updates the INFO field of a variants table with the variant ID. 
        """

        # variant_id annotation field
        variant_id_tag = self.get_variant_id_column()
        added_columns = [variant_id_tag]

        # variant_id hgvs tags"
        vcf_infos_tags = {
            variant_id_tag: "howard variant ID annotation",
        }

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add variant_id to header
        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
            variant_id_tag,
            ".",
            "String",
            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Update: append '<tag>=<value>' to INFO (';' separator when INFO not empty)
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    '{variant_id_tag}=',
                    "{variant_id_tag}"
                )
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
        function is used to specify the name of the column that will store the HGVS nomenclatures
        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
        function represents the field in the VCF file that contains SnpEff annotations. This field is
        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
        to ANN
        :type snpeff_field: str (optional)
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): this overwrites any configured prefix with "INFO/"; it looks
        # like it may have been intended as `if not prefix:` — confirm (same pattern
        # appears in calculation_snpeff_ann_explode).
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: SnpEff encodes sub-field names in the INFO
            # description, quoted and separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the normalized key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: join on variant id, append '<tag>=<value>' for non-empty results
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory immediately)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            # NOTE(review): "Anotate"/"before use" are typos in this runtime log string;
            # left unchanged here.
            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
        boolean flag that determines whether the output should be uniquified or not.
        When set to `True`,
        it indicates that the output should be unique, meaning that duplicate entries should be removed,
        defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
        function specifies the format in which the output annotations will be generated. It has a
        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
        format, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
        method is used to specify the prefix that will be added to the output annotations generated
        during the calculation process. This prefix helps to differentiate the newly added annotations
        from existing ones in the output data. By default, the, defaults to ANN_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
        function is used to specify the field in the VCF file that contains SnpEff annotations. This
        field will be processed to explode the HGVS annotations and update the variant information
        accordingly, defaults to ANN
        :type snpeff_field: str (optional)
        """

        # SnpEff annotation field
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): overwrites any configured prefix with "INFO/" — possibly
        # intended as `if not prefix:`; confirm (same pattern as
        # calculation_extract_snpeff_hgvs).
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: sub-field names are quoted in the INFO description,
            # separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized key: alphanumeric characters only
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: one single JSON field, or one field per ANN sub-annotation
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update: join on variant id, append exploded annotations to INFO
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory immediately)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            # NOTE(review): "Anotate"/"before use" are typos in this runtime log string;
            # left unchanged here.
            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
        """

        # NOMEN field
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g.
for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field (from calculation/calculations/NOMEN/options/hgvs_field)
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts (optional file of preferred transcripts, first column used)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Added columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column (dict of all NOMEN sub-fields per variant)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # One concat() argument per NOMEN sub-field, empty when value is missing
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update: join on (#CHROM, POS, REF, ALT), append all NOMEN fields to INFO
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory immediately)
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file.
It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples (genotype data required for this calculation)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (row-wise over FORMAT + sample columns)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: join on variant id, append '<tag>=<value>' for non-empty results
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe (free memory immediately)
            del dataframe_findbypipeline
            gc.collect()

    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.
        """

        # if FORMAT and samples (genotype data required for this calculation)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT
{samples_fields} FROM {table_variants} """ 8448 ) 8449 8450 # Create genotypeconcordance column 8451 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 8452 dataframe_genotypeconcordance.apply( 8453 lambda row: genotypeconcordance( 8454 row, samples=self.get_header_sample_list() 8455 ), 8456 axis=1, 8457 ) 8458 ) 8459 8460 # Add genotypeconcordance to header 8461 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 8462 genotypeconcordance_tag, 8463 ".", 8464 "String", 8465 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 8466 "howard calculation", 8467 "0", 8468 self.code_type_map.get("String"), 8469 ) 8470 8471 # Update 8472 sql_update = f""" 8473 UPDATE variants 8474 SET "INFO" = 8475 concat( 8476 CASE 8477 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8478 THEN '' 8479 ELSE concat("INFO", ';') 8480 END, 8481 CASE 8482 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 8483 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 8484 THEN concat( 8485 '{genotypeconcordance_tag}=', 8486 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 8487 ) 8488 ELSE '' 8489 END 8490 ) 8491 FROM dataframe_genotypeconcordance 8492 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 8493 """ 8494 self.conn.execute(sql_update) 8495 8496 # Remove added columns 8497 for added_column in added_columns: 8498 self.drop_column(column=added_column) 8499 8500 # Delete dataframe 8501 del dataframe_genotypeconcordance 8502 gc.collect() 8503 8504 def calculation_barcode(self, tag: str = "barcode") -> None: 8505 """ 8506 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 8507 updates the INFO field in the file with the calculated barcode values. 8508 8509 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 8510 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 8511 the default tag name is set to "barcode", defaults to barcode 8512 :type tag: str (optional) 8513 """ 8514 8515 # if FORMAT and samples 8516 if ( 8517 "FORMAT" in self.get_header_columns_as_list() 8518 and self.get_header_sample_list() 8519 ): 8520 8521 # barcode annotation field 8522 if not tag: 8523 tag = "barcode" 8524 8525 # VCF infos tags 8526 vcf_infos_tags = { 8527 tag: "barcode calculation (VaRank)", 8528 } 8529 8530 # Prefix 8531 prefix = self.get_explode_infos_prefix() 8532 8533 # Field 8534 barcode_infos = prefix + tag 8535 8536 # Variants table 8537 table_variants = self.get_table_variants() 8538 8539 # Header 8540 vcf_reader = self.get_header() 8541 8542 # Create variant id 8543 variant_id_column = self.get_variant_id_column() 8544 added_columns = [variant_id_column] 8545 8546 # variant_id, FORMAT and samples 8547 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8548 self.get_header_sample_list() 8549 ) 8550 8551 # Create dataframe 8552 dataframe_barcode = self.get_query_to_df( 8553 f""" SELECT {samples_fields} FROM {table_variants} """ 8554 ) 8555 8556 # Create barcode column 8557 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8558 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 8559 ) 8560 8561 # Add barcode to header 8562 vcf_reader.infos[tag] = vcf.parser._Info( 8563 tag, 8564 ".", 8565 "String", 8566 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 8567 "howard calculation", 8568 "0", 8569 self.code_type_map.get("String"), 8570 ) 8571 8572 # Update 8573 sql_update = f""" 8574 UPDATE {table_variants} 8575 SET "INFO" = 8576 concat( 8577 CASE 8578 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8579 THEN '' 8580 ELSE concat("INFO", ';') 8581 END, 8582 CASE 8583 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 8584 AND dataframe_barcode."{barcode_infos}" NOT NULL 8585 THEN concat( 8586 '{tag}=', 8587 dataframe_barcode."{barcode_infos}" 8588 ) 8589 ELSE '' 8590 
def calculation_barcode_family(self, tag: str = "BCF") -> None:
    """
    Compute a family barcode from the genotypes of the pedigree samples and
    append it, plus the list of samples used, to each sample genotype.

    The pedigree comes from param
    ``calculation.calculations.BARCODEFAMILY.family_pedigree`` and may be a
    JSON file path, a JSON string, a comma-separated list of sample names,
    or a dict; when absent, all samples are used.

    Two FORMAT fields are added: '<tag>' (the family barcode) and '<tag>S'
    (the comma-separated samples used).

    :param tag: FORMAT tag name used for the family barcode; falsy values
        fall back to "BCF", defaults to BCF
    :type tag: str (optional)
    :raises ValueError: if the pedigree is not well formatted or empty
    """

    # Only meaningful when genotypes are present (FORMAT column and samples)
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # barcode annotation field (fallback when an empty tag is provided)
        if not tag:
            tag = "BCF"

        # VCF infos tags
        vcf_infos_tags = {
            tag: "barcode family calculation",
            f"{tag}S": "barcode family samples",
        }

        # Param
        param = self.get_param()
        log.debug(f"param={param}")

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # PED param
        ped = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("BARCODEFAMILY", {})
            .get("family_pedigree", None)
        )
        log.debug(f"ped={ped}")

        # Load PED
        if ped:

            # Pedigree is a file
            # Fixed: do not shadow the open file handle with the parsed data
            if isinstance(ped, str) and os.path.exists(full_path(ped)):
                log.debug("Pedigree is file")
                with open(full_path(ped)) as ped_file:
                    ped = json.load(ped_file)

            # Pedigree is a string: JSON first, else comma-separated samples
            elif isinstance(ped, str):
                log.debug("Pedigree is str")
                try:
                    ped = json.loads(ped)
                    log.debug("Pedigree is json str")
                except ValueError:
                    ped_samples = ped.split(",")
                    ped = {}
                    for ped_sample in ped_samples:
                        ped[ped_sample] = ped_sample

            # Pedigree is a dict
            elif isinstance(ped, dict):
                log.debug("Pedigree is dict")

            # Pedigree is not well formatted
            else:
                msg_error = "Pedigree not well formatted"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Construct list
            ped_samples = list(ped.values())

        else:
            log.debug("Pedigree not defined. Take all samples")
            ped_samples = self.get_header_sample_list()
            ped = {}
            for ped_sample in ped_samples:
                ped[ped_sample] = ped_sample

        # Check pedigree
        if not ped or len(ped) == 0:
            msg_error = f"Error in pedigree: samples {ped_samples}"
            log.error(msg_error)
            raise ValueError(msg_error)

        # Log
        log.info(
            "Calculation 'BARCODEFAMILY' - Samples: "
            + ", ".join([f"{member}='{ped[member]}'" for member in ped])
        )
        log.debug(f"ped_samples={ped_samples}")

        # Field
        barcode_infos = prefix + tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Create variant id column (dropped again at the end)
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # variant_id, FORMAT and samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            ped_samples
        )

        # Create dataframe
        dataframe_barcode = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Create barcode column
        dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
            lambda row: barcode(row, samples=ped_samples), axis=1
        )

        # Add barcode family FORMAT fields to header
        # (fixed misleading copy-pasted comment mentioning vaf_normalization)
        vcf_reader.formats[tag] = vcf.parser._Format(
            id=tag,
            num=".",
            type="String",
            desc=vcf_infos_tags.get(tag, "barcode family calculation"),
            type_code=self.code_type_map.get("String"),
        )
        vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
            id=f"{tag}S",
            num=".",
            type="String",
            desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
            type_code=self.code_type_map.get("String"),
        )

        # Build one SET clause per sample column (and for FORMAT itself):
        # pedigree samples receive the barcode and sample list, other samples
        # receive '.', and FORMAT is extended with the two new tag names
        sql_update_set = []
        for sample in self.get_header_sample_list() + ["FORMAT"]:
            if sample in ped_samples:
                value = f'dataframe_barcode."{barcode_infos}"'
                value_samples = "'" + ",".join(ped_samples) + "'"
            elif sample == "FORMAT":
                value = f"'{tag}'"
                value_samples = f"'{tag}S'"
            else:
                value = "'.'"
                value_samples = "'.'"
            # './.' genotypes are padded with one missing value per FORMAT key
            format_regex = r"[a-zA-Z0-9\s]"
            sql_update_set.append(
                f"""
                "{sample}" =
                concat(
                    CASE
                        WHEN {table_variants}."{sample}" = './.'
                        THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                        ELSE {table_variants}."{sample}"
                    END,
                    ':',
                    {value},
                    ':',
                    {value_samples}
                )
                """
            )

        sql_update_set_join = ", ".join(sql_update_set)
        sql_update = f"""
            UPDATE {table_variants}
            SET {sql_update_set_join}
            FROM dataframe_barcode
            WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Delete dataframe
        del dataframe_barcode
        gc.collect()
def calculation_trio(self) -> None:
    """
    Annotate each variant with a trio inheritance value computed from the
    father/mother/child genotypes, appended to INFO as 'trio=<value>'.

    The trio is read from param
    ``calculation.calculations.TRIO.trio_pedigree`` and may be a JSON file
    path, a JSON string, a comma-separated 'father,mother,child' list, or a
    dict; when absent, the first three samples are used.

    :raises ValueError: if the trio pedigree is not well formatted or fewer
        than three samples are available
    """

    # Only meaningful when genotypes are present (FORMAT column and samples)
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # trio annotation field
        trio_tag = "trio"

        # VCF infos tags
        vcf_infos_tags = {
            "trio": "trio calculation",
        }

        # Param
        param = self.get_param()

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Trio param
        trio_ped = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("TRIO", {})
            .get("trio_pedigree", None)
        )

        # Load trio
        if trio_ped:

            # Trio pedigree is a file
            # Fixed: do not shadow the open file handle with the parsed data
            if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                log.debug("TRIO pedigree is file")
                with open(full_path(trio_ped)) as trio_ped_file:
                    trio_ped = json.load(trio_ped_file)

            # Trio pedigree is a string: JSON first, else 'father,mother,child'
            elif isinstance(trio_ped, str):
                log.debug("TRIO pedigree is str")
                try:
                    trio_ped = json.loads(trio_ped)
                    log.debug("TRIO pedigree is json str")
                except ValueError:
                    trio_samples = trio_ped.split(",")
                    if len(trio_samples) == 3:
                        trio_ped = {
                            "father": trio_samples[0],
                            "mother": trio_samples[1],
                            "child": trio_samples[2],
                        }
                        log.debug("TRIO pedigree is list str")
                    else:
                        msg_error = "TRIO pedigree not well formatted"
                        log.error(msg_error)
                        raise ValueError(msg_error)

            # Trio pedigree is a dict
            elif isinstance(trio_ped, dict):
                log.debug("TRIO pedigree is dict")

            # Trio pedigree is not well formatted
            else:
                msg_error = "TRIO pedigree not well formatted"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Construct trio list
            trio_samples = [
                trio_ped.get("father", ""),
                trio_ped.get("mother", ""),
                trio_ped.get("child", ""),
            ]

        else:
            log.debug("TRIO pedigree not defined. Take the first 3 samples")
            samples_list = self.get_header_sample_list()
            if len(samples_list) >= 3:
                trio_samples = self.get_header_sample_list()[0:3]
                trio_ped = {
                    "father": trio_samples[0],
                    "mother": trio_samples[1],
                    "child": trio_samples[2],
                }
            else:
                msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                log.error(msg_error)
                raise ValueError(msg_error)

        # Check trio pedigree
        if not trio_ped or len(trio_ped) != 3:
            msg_error = f"Error in TRIO pedigree: {trio_ped}"
            log.error(msg_error)
            raise ValueError(msg_error)

        # Log
        log.info(
            "Calculation 'TRIO' - Samples: "
            + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
        )

        # Field
        trio_infos = prefix + trio_tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Create variant id column (dropped again at the end)
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # variant_id, FORMAT and samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Create dataframe
        dataframe_trio = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Create trio column
        dataframe_trio[trio_infos] = dataframe_trio.apply(
            lambda row: trio(row, samples=trio_samples), axis=1
        )

        # Add trio to header
        # Fixed: fallback description was copy-pasted from snpEff code
        # ("snpEff hgvs annotations"); use the trio description instead
        vcf_reader.infos[trio_tag] = vcf.parser._Info(
            trio_tag,
            ".",
            "String",
            vcf_infos_tags.get(trio_tag, "trio calculation"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append 'trio=<value>' to INFO, preserving any existing content
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                        AND dataframe_trio."{trio_infos}" NOT NULL
                        THEN concat(
                            '{trio_tag}=',
                            dataframe_trio."{trio_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_trio
            WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Delete dataframe
        del dataframe_trio
        gc.collect()
def calculation_vaf_normalization(self) -> None:
    """
    Compute a normalized VAF (Variant Allele Frequency) for each sample
    genotype and append it as a new 'VAF' key in FORMAT and in every sample
    column. Skipped when the header already declares a 'VAF' FORMAT field.

    :return: None
    """

    # Only meaningful when genotypes are present (FORMAT column and samples)
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # vaf_normalization annotation field
        vaf_normalization_tag = "VAF"

        # VCF infos tags
        vcf_infos_tags = {
            "VAF": "VAF Variant Frequency",
        }

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Do not calculate if VAF already exists
        if "VAF" in vcf_reader.formats:
            log.debug("VAF already on genotypes")
            return

        # Create variant id column (dropped again at the end)
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # variant_id, FORMAT and samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Create dataframe
        # Fixed: samples_fields already contains the variant id and FORMAT
        # columns; selecting them again produced duplicated columns in the
        # resulting dataframe
        dataframe_vaf_normalization = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        vaf_normalization_set = []

        # Normalize each sample genotype and prepare its SQL SET clause
        for sample in self.get_header_sample_list():
            dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
                lambda row: vaf_normalization(row, sample=sample), axis=1
            )
            vaf_normalization_set.append(
                f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
            )

        # Add VAF to FORMAT
        dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
            "FORMAT"
        ].apply(lambda x: str(x) + ":VAF")
        vaf_normalization_set.append(
            f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
        )

        # Add vaf_normalization to header
        vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
            id=vaf_normalization_tag,
            num="1",
            type="Float",
            desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
            type_code=self.code_type_map.get("Float"),
        )

        # Create SET clauses for the update
        sql_vaf_normalization_set = " , ".join(vaf_normalization_set)

        # Update
        # Fixed: WHERE clause hardcoded the 'variants' table name instead of
        # using {table_variants} like the sibling calculation methods
        sql_update = f"""
            UPDATE {table_variants}
            SET {sql_vaf_normalization_set}
            FROM dataframe_vaf_normalization
            WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Delete dataframe
        del dataframe_vaf_normalization
        gc.collect()
def calculation_genotype_stats(self, info: str = "VAF") -> None:
    """
    Compute per-variant statistics (count, list, min, max, mean, median,
    standard deviation) of the genotype field *info* across all samples and
    append them to the INFO column as '<info>_stats_*' tags.

    :param info: name of the genotype field to summarize, defaults to VAF
    :type info: str (optional)
    """

    # Only meaningful when genotypes are present (FORMAT column and samples)
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # vaf_stats annotation field
        vaf_stats_tag = info + "_stats"

        # VCF infos tags
        vcf_infos_tags = {
            info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
            info + "_stats_list": f"genotype {info} Statistics - list of {info}",
            info + "_stats_min": f"genotype {info} Statistics - min {info}",
            info + "_stats_max": f"genotype {info} Statistics - max {info}",
            info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
            info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
            info
            + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
        }

        # Prefix
        prefix = self.get_explode_infos_prefix()

        # Field
        vaf_stats_infos = prefix + vaf_stats_tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Create variant id column (dropped again at the end)
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # variant_id, FORMAT and samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Create dataframe
        dataframe_vaf_stats = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Compute the stats dict for each variant row
        dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
            lambda row: genotype_stats(
                row, samples=self.get_header_sample_list(), info=info
            ),
            axis=1,
        )

        # List of per-stat SQL fragments
        sql_vaf_stats_fields = []

        # Extract each stat, declare it in the header and build its fragment
        for stat in vcf_infos_tags:

            # Extract stats
            dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                lambda x: dict(x).get(stat, "")
            )

            # Add the stat tag to the header
            # (fixed misleading copy-pasted comment mentioning snpeff_hgvs)
            vcf_reader.infos[stat] = vcf.parser._Info(
                stat,
                ".",
                "String",
                vcf_infos_tags.get(stat, "genotype statistics"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Separate from the previous stat field with ';' (none for the first)
            sep = ";" if sql_vaf_stats_fields else ""

            # Create fields to add in INFO
            sql_vaf_stats_fields.append(
                f"""
                CASE
                    WHEN dataframe_vaf_stats."{stat}" NOT NULL
                    THEN concat(
                        '{sep}{stat}=',
                        dataframe_vaf_stats."{stat}"
                    )
                    ELSE ''
                END
                """
            )

        # SQL set for update
        sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

        # Update
        # Fixed: use {table_variants} instead of the hardcoded 'variants'
        # table name, for consistency with the sibling calculation methods
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    {sql_vaf_stats_fields_set}
                )
            FROM dataframe_vaf_stats
            WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Delete dataframe
        del dataframe_vaf_stats
        gc.collect()
36 def __init__( 37 self, 38 conn=None, 39 input: str = None, 40 output: str = None, 41 config: dict = {}, 42 param: dict = {}, 43 load: bool = False, 44 ) -> None: 45 """ 46 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 47 header 48 49 :param conn: the connection to the database 50 :param input: the input file 51 :param output: the output file 52 :param config: a dictionary containing the configuration of the model 53 :param param: a dictionary containing the parameters of the model 54 """ 55 56 # Init variables 57 self.init_variables() 58 59 # Input 60 self.set_input(input) 61 62 # Config 63 self.set_config(config) 64 65 # Param 66 self.set_param(param) 67 68 # Output 69 self.set_output(output) 70 71 # connexion 72 self.set_connexion(conn) 73 74 # Header 75 self.set_header() 76 77 # Load data 78 if load: 79 self.load_data()
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
81 def set_input(self, input: str = None) -> None: 82 """ 83 The function `set_input` takes a file name as input, extracts the name and extension, and sets 84 attributes in the class accordingly. 85 86 :param input: The `set_input` method in the provided code snippet is used to set attributes 87 related to the input file. Here's a breakdown of the parameters and their usage in the method: 88 :type input: str 89 """ 90 91 if input and not isinstance(input, str): 92 try: 93 self.input = input.name 94 except: 95 log.error(f"Input file '{input} in bad format") 96 raise ValueError(f"Input file '{input} in bad format") 97 else: 98 self.input = input 99 100 # Input format 101 if input: 102 input_name, input_extension = os.path.splitext(self.input) 103 self.input_name = input_name 104 self.input_extension = input_extension 105 self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The `set_input` method in the provided code snippet is used to set attributes related to the input file. Here's a breakdown of the parameters and their usage in the method:
107 def set_config(self, config: dict) -> None: 108 """ 109 The set_config function takes a config object and assigns it as the configuration object for the 110 class. 111 112 :param config: The `config` parameter in the `set_config` function is a dictionary object that 113 contains configuration settings for the class. When you call the `set_config` function with a 114 dictionary object as the argument, it will set that dictionary as the configuration object for 115 the class 116 :type config: dict 117 """ 118 119 self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The `config` parameter in the `set_config` function is a dictionary object that contains configuration settings for the class. When you call the `set_config` function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class.
121 def set_param(self, param: dict) -> None: 122 """ 123 This function sets a parameter object for the class based on the input dictionary. 124 125 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 126 as the `param` attribute of the class instance 127 :type param: dict 128 """ 129 130 self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The `set_param` method you provided takes a dictionary object as input and sets it as the `param` attribute of the class instance.
132 def init_variables(self) -> None: 133 """ 134 This function initializes the variables that will be used in the rest of the class 135 """ 136 137 self.prefix = "howard" 138 self.table_variants = "variants" 139 self.dataframe = None 140 141 self.comparison_map = { 142 "gt": ">", 143 "gte": ">=", 144 "lt": "<", 145 "lte": "<=", 146 "equals": "=", 147 "contains": "SIMILAR TO", 148 } 149 150 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 151 152 self.code_type_map_to_sql = { 153 "Integer": "INTEGER", 154 "String": "VARCHAR", 155 "Float": "FLOAT", 156 "Flag": "VARCHAR", 157 } 158 159 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
161 def get_indexing(self) -> bool: 162 """ 163 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 164 returns False. 165 :return: The value of the indexing parameter. 166 """ 167 168 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
170 def get_connexion_config(self) -> dict: 171 """ 172 The function `get_connexion_config` returns a dictionary containing the configuration for a 173 connection, including the number of threads and memory limit. 174 :return: a dictionary containing the configuration for the Connexion library. 175 """ 176 177 # config 178 config = self.get_config() 179 180 # Connexion config 181 connexion_config = {} 182 threads = self.get_threads() 183 184 # Threads 185 if threads: 186 connexion_config["threads"] = threads 187 188 # Memory 189 # if config.get("memory", None): 190 # connexion_config["memory_limit"] = config.get("memory") 191 if self.get_memory(): 192 connexion_config["memory_limit"] = self.get_memory() 193 194 # Temporary directory 195 if config.get("tmp", None): 196 connexion_config["temp_directory"] = config.get("tmp") 197 198 # Access 199 if config.get("access", None): 200 access = config.get("access") 201 if access in ["RO"]: 202 access = "READ_ONLY" 203 elif access in ["RW"]: 204 access = "READ_WRITE" 205 connexion_db = self.get_connexion_db() 206 if connexion_db in ":memory:": 207 access = "READ_WRITE" 208 connexion_config["access_mode"] = access 209 210 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the Connexion library.
212 def get_duckdb_settings(self) -> dict: 213 """ 214 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 215 string. 216 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 217 """ 218 219 # config 220 config = self.get_config() 221 222 # duckdb settings 223 duckdb_settings_dict = {} 224 if config.get("duckdb_settings", None): 225 duckdb_settings = config.get("duckdb_settings") 226 duckdb_settings = full_path(duckdb_settings) 227 # duckdb setting is a file 228 if os.path.exists(duckdb_settings): 229 with open(duckdb_settings) as json_file: 230 duckdb_settings_dict = yaml.safe_load(json_file) 231 # duckdb settings is a string 232 else: 233 duckdb_settings_dict = json.loads(duckdb_settings) 234 235 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
237 def set_connexion_db(self) -> str: 238 """ 239 The function `set_connexion_db` returns the appropriate database connection string based on the 240 input format and connection type. 241 :return: the value of the variable `connexion_db`. 242 """ 243 244 # Default connexion db 245 default_connexion_db = ":memory:" 246 247 # Find connexion db 248 if self.get_input_format() in ["db", "duckdb"]: 249 connexion_db = self.get_input() 250 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 251 connexion_db = default_connexion_db 252 elif self.get_connexion_type() in ["tmpfile"]: 253 tmp_name = tempfile.mkdtemp( 254 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 255 ) 256 connexion_db = f"{tmp_name}/tmp.db" 257 elif self.get_connexion_type() != "": 258 connexion_db = self.get_connexion_type() 259 else: 260 connexion_db = default_connexion_db 261 262 # Set connexion db 263 self.connexion_db = connexion_db 264 265 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable `connexion_db`.
267 def set_connexion(self, conn) -> None: 268 """ 269 The function `set_connexion` creates a connection to a database, with options for different 270 database formats and settings. 271 272 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 273 database. If a connection is not provided, a new connection to an in-memory database is created. 274 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 275 sqlite 276 """ 277 278 # Connexion db 279 connexion_db = self.set_connexion_db() 280 281 # Connexion config 282 connexion_config = self.get_connexion_config() 283 284 # Connexion format 285 connexion_format = self.get_config().get("connexion_format", "duckdb") 286 # Set connexion format 287 self.connexion_format = connexion_format 288 289 # Connexion 290 if not conn: 291 if connexion_format in ["duckdb"]: 292 conn = duckdb.connect(connexion_db, config=connexion_config) 293 # duckDB settings 294 duckdb_settings = self.get_duckdb_settings() 295 if duckdb_settings: 296 for setting in duckdb_settings: 297 setting_value = duckdb_settings.get(setting) 298 if isinstance(setting_value, str): 299 setting_value = f"'{setting_value}'" 300 conn.execute(f"PRAGMA {setting}={setting_value};") 301 elif connexion_format in ["sqlite"]: 302 conn = sqlite3.connect(connexion_db) 303 304 # Set connexion 305 self.conn = conn 306 307 # Log 308 log.debug(f"connexion_format: {connexion_format}") 309 log.debug(f"connexion_db: {connexion_db}") 310 log.debug(f"connexion config: {connexion_config}") 311 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: The `conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
313 def set_output(self, output: str = None) -> None: 314 """ 315 The `set_output` function in Python sets the output file based on the input or a specified key 316 in the config file, extracting the output name, extension, and format. 317 318 :param output: The `output` parameter in the `set_output` method is used to specify the name of 319 the output file. If the config file has an 'output' key, the method sets the output to the value 320 of that key. If no output is provided, it sets the output to `None` 321 :type output: str 322 """ 323 324 if output and not isinstance(output, str): 325 self.output = output.name 326 else: 327 self.output = output 328 329 # Output format 330 if self.output: 331 output_name, output_extension = os.path.splitext(self.output) 332 self.output_name = output_name 333 self.output_extension = output_extension 334 self.output_format = self.output_extension.replace(".", "") 335 else: 336 self.output_name = None 337 self.output_extension = None 338 self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: The `output` parameter in the `set_output` method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
def set_header(self) -> None:
    """
    Read the header of the input file and store it both as a list of lines
    (self.header_list) and as a VCF reader object (self.header_vcf).

    The header is looked up, in order: in the configured "header_file",
    inside the VCF itself, in a "<input>.hdr" side-car file, or inferred
    from the file columns; a minimal default VCF header is the last resort.

    :raises ValueError: if the input file format is not supported
    """

    input_file = self.get_input()
    # Fixed: the #CHROM line must be tab-separated per the VCF specification
    default_header_list = [
        "##fileformat=VCFv4.2",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
    ]

    # Full path
    input_file = full_path(input_file)

    if input_file:

        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        config = self.get_config()
        header_list = default_header_list
        if input_format in [
            "vcf",
            "hdr",
            "tsv",
            "csv",
            "psv",
            "parquet",
            "db",
            "duckdb",
        ]:
            # header provided in param
            if config.get("header_file", None):
                with open(config.get("header_file"), "rt") as f:
                    header_list = self.read_vcf_header(f)
            # within a vcf file format (header within input file itself)
            elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                # within a compressed vcf file format (.vcf.gz)
                if input_compressed:
                    with bgzf.open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within an uncompressed vcf file format (.vcf)
                else:
                    with open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
            # header provided in default external file .hdr
            elif os.path.exists(input_file + ".hdr"):
                with open(input_file + ".hdr", "rt") as f:
                    header_list = self.read_vcf_header(f)
            else:
                # Try to infer header info fields and file columns
                try:

                    with tempfile.TemporaryDirectory() as tmpdir:

                        # Create database
                        db_for_header = Database(database=input_file)

                        # Get header columns for infos fields
                        db_header_from_columns = (
                            db_for_header.get_header_from_columns()
                        )

                        # Get real columns in the file
                        db_header_columns = db_for_header.get_columns()

                        # Write header file
                        # Fixed: use a context manager so the handle is
                        # closed even if vcf.Writer raises
                        header_file_tmp = os.path.join(tmpdir, "header")
                        with open(header_file_tmp, "w") as f:
                            vcf.Writer(f, db_header_from_columns)

                        # Replace #CHROM line with real columns
                        header_list = db_for_header.read_header_file(
                            header_file=header_file_tmp
                        )
                        header_list[-1] = "\t".join(db_header_columns)

                # Fixed: narrowed the bare except so KeyboardInterrupt and
                # SystemExit are not swallowed
                except Exception:

                    log.warning(
                        f"No header for file {input_file}. Set as default VCF header"
                    )
                    header_list = default_header_list

        else:  # unknown format

            log.error(f"Input file format '{input_format}' not available")
            raise ValueError(f"Input file format '{input_format}' not available")

        if not header_list:
            header_list = default_header_list

        # header as list
        self.header_list = header_list

        # header as VCF object
        self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

    else:

        self.header_list = None
        self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
442 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 443 """ 444 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 445 DataFrame based on the connection format. 446 447 :param query: The `query` parameter in the `get_query_to_df` function is a string that 448 represents the SQL query you want to execute. This query will be used to fetch data from a 449 database and convert it into a pandas DataFrame 450 :type query: str 451 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 452 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 453 function will only fetch up to that number of rows from the database query result. If no limit 454 is specified, 455 :type limit: int 456 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 457 """ 458 459 # Connexion format 460 connexion_format = self.get_connexion_format() 461 462 # Limit in query 463 if limit: 464 pd.set_option("display.max_rows", limit) 465 if connexion_format in ["duckdb"]: 466 df = ( 467 self.conn.execute(query) 468 .fetch_record_batch(limit) 469 .read_next_batch() 470 .to_pandas() 471 ) 472 elif connexion_format in ["sqlite"]: 473 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 474 475 # Full query 476 else: 477 if connexion_format in ["duckdb"]: 478 df = self.conn.execute(query).df() 479 elif connexion_format in ["sqlite"]: 480 df = pd.read_sql_query(query, self.conn) 481 482 return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: The `query` parameter is a string containing the SQL query to execute. The query is run against the database and its result is converted into a pandas DataFrame.
- limit: The `limit` parameter specifies the maximum number of rows to return in the resulting DataFrame. If a limit is provided, the function fetches at most that number of rows from the query result. If no limit is specified, all rows are returned.
Returns
A pandas DataFrame is returned by the `get_query_to_df` function.
484 def get_overview(self) -> None: 485 """ 486 The function prints the input, output, config, and dataframe of the current object 487 """ 488 table_variants_from = self.get_table_variants(clause="from") 489 sql_columns = self.get_header_columns_as_sql() 490 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 491 df = self.get_query_to_df(sql_query_export) 492 log.info( 493 "Input: " 494 + str(self.get_input()) 495 + " [" 496 + str(str(self.get_input_format())) 497 + "]" 498 ) 499 log.info( 500 "Output: " 501 + str(self.get_output()) 502 + " [" 503 + str(str(self.get_output_format())) 504 + "]" 505 ) 506 log.info("Config: ") 507 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 508 "\n" 509 ): 510 log.info("\t" + str(d)) 511 log.info("Param: ") 512 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 513 "\n" 514 ): 515 log.info("\t" + str(d)) 516 log.info("Sample list: " + str(self.get_header_sample_list())) 517 log.info("Dataframe: ") 518 for d in str(df).split("\n"): 519 log.info("\t" + str(d)) 520 521 # garbage collector 522 del df 523 gc.collect() 524 525 return None
The function prints the input, output, config, and dataframe of the current object
527 def get_stats(self) -> dict: 528 """ 529 The `get_stats` function calculates and returns various statistics of the current object, 530 including information about the input file, variants, samples, header fields, quality, and 531 SNVs/InDels. 532 :return: a dictionary containing various statistics of the current object. The dictionary has 533 the following structure: 534 """ 535 536 # Log 537 log.info(f"Stats Calculation...") 538 539 # table varaints 540 table_variants_from = self.get_table_variants() 541 542 # stats dict 543 stats = {"Infos": {}} 544 545 ### File 546 input_file = self.get_input() 547 stats["Infos"]["Input file"] = input_file 548 549 # Header 550 header_infos = self.get_header().infos 551 header_formats = self.get_header().formats 552 header_infos_list = list(header_infos) 553 header_formats_list = list(header_formats) 554 555 ### Variants 556 557 stats["Variants"] = {} 558 559 # Variants by chr 560 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 561 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 562 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 563 by=["CHROM"], kind="quicksort" 564 ) 565 566 # Total number of variants 567 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 568 569 # Calculate percentage 570 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 571 lambda x: (x / nb_of_variants) 572 ) 573 574 stats["Variants"]["Number of variants by chromosome"] = ( 575 nb_of_variants_by_chrom.to_dict(orient="index") 576 ) 577 578 stats["Infos"]["Number of variants"] = int(nb_of_variants) 579 580 ### Samples 581 582 # Init 583 samples = {} 584 nb_of_samples = 0 585 586 # Check Samples 587 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 588 log.debug(f"Check samples...") 589 for sample in self.get_header_sample_list(): 590 sql_query_samples = f""" 591 SELECT 
'{sample}' as sample, 592 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 593 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 594 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 595 FROM {table_variants_from} 596 WHERE ( 597 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 598 AND 599 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 600 ) 601 GROUP BY genotype 602 """ 603 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 604 sample_genotype_count = sql_query_genotype_df["count"].sum() 605 if len(sql_query_genotype_df): 606 nb_of_samples += 1 607 samples[f"{sample} - {sample_genotype_count} variants"] = ( 608 sql_query_genotype_df.to_dict(orient="index") 609 ) 610 611 stats["Samples"] = samples 612 stats["Infos"]["Number of samples"] = nb_of_samples 613 614 # # 615 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 616 # stats["Infos"]["Number of samples"] = nb_of_samples 617 # elif nb_of_samples: 618 # stats["Infos"]["Number of samples"] = "not a VCF format" 619 620 ### INFO and FORMAT fields 621 header_types_df = {} 622 header_types_list = { 623 "List of INFO fields": header_infos, 624 "List of FORMAT fields": header_formats, 625 } 626 i = 0 627 for header_type in header_types_list: 628 629 header_type_infos = header_types_list.get(header_type) 630 header_infos_dict = {} 631 632 for info in header_type_infos: 633 634 i += 1 635 header_infos_dict[i] = {} 636 637 # ID 638 header_infos_dict[i]["id"] = info 639 640 # num 641 genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"} 642 if header_type_infos[info].num in genotype_map.keys(): 643 header_infos_dict[i]["Number"] = genotype_map.get( 644 header_type_infos[info].num 645 ) 646 else: 647 header_infos_dict[i]["Number"] = header_type_infos[info].num 648 649 # type 650 if header_type_infos[info].type: 651 header_infos_dict[i]["Type"] = 
header_type_infos[info].type 652 else: 653 header_infos_dict[i]["Type"] = "." 654 655 # desc 656 if header_type_infos[info].desc != None: 657 header_infos_dict[i]["Description"] = header_type_infos[info].desc 658 else: 659 header_infos_dict[i]["Description"] = "" 660 661 if len(header_infos_dict): 662 header_types_df[header_type] = pd.DataFrame.from_dict( 663 header_infos_dict, orient="index" 664 ).to_dict(orient="index") 665 666 # Stats 667 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 668 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 669 stats["Header"] = header_types_df 670 671 ### QUAL 672 if "QUAL" in self.get_header_columns(): 673 sql_query_qual = f""" 674 SELECT 675 avg(CAST(QUAL AS INTEGER)) AS Average, 676 min(CAST(QUAL AS INTEGER)) AS Minimum, 677 max(CAST(QUAL AS INTEGER)) AS Maximum, 678 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 679 median(CAST(QUAL AS INTEGER)) AS Median, 680 variance(CAST(QUAL AS INTEGER)) AS Variance 681 FROM {table_variants_from} 682 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 683 """ 684 685 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 686 stats["Quality"] = {"Stats": qual} 687 688 ### SNV and InDel 689 690 sql_query_snv = f""" 691 692 SELECT Type, count FROM ( 693 694 SELECT 695 'Total' AS Type, 696 count(*) AS count 697 FROM {table_variants_from} 698 699 UNION 700 701 SELECT 702 'MNV' AS Type, 703 count(*) AS count 704 FROM {table_variants_from} 705 WHERE len(REF) > 1 AND len(ALT) > 1 706 AND len(REF) = len(ALT) 707 708 UNION 709 710 SELECT 711 'InDel' AS Type, 712 count(*) AS count 713 FROM {table_variants_from} 714 WHERE len(REF) > 1 OR len(ALT) > 1 715 AND len(REF) != len(ALT) 716 717 UNION 718 719 SELECT 720 'SNV' AS Type, 721 count(*) AS count 722 FROM {table_variants_from} 723 WHERE len(REF) = 1 AND len(ALT) = 1 724 725 ) 726 727 ORDER BY count DESC 728 729 """ 730 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 731 732 
sql_query_snv_substitution = f""" 733 SELECT 734 concat(REF, '>', ALT) AS 'Substitution', 735 count(*) AS count 736 FROM {table_variants_from} 737 WHERE len(REF) = 1 AND len(ALT) = 1 738 GROUP BY REF, ALT 739 ORDER BY count(*) DESC 740 """ 741 snv_substitution = ( 742 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 743 ) 744 stats["Variants"]["Counts"] = snv_indel 745 stats["Variants"]["Substitutions"] = snv_substitution 746 747 return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
749 def stats_to_file(self, file: str = None) -> str: 750 """ 751 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 752 into a JSON object, and writes the JSON object to the specified file. 753 754 :param file: The `file` parameter is a string that represents the file path where the JSON data 755 will be written 756 :type file: str 757 :return: the name of the file that was written to. 758 """ 759 760 # Get stats 761 stats = self.get_stats() 762 763 # Serializing json 764 json_object = json.dumps(stats, indent=4) 765 766 # Writing to sample.json 767 with open(file, "w") as outfile: 768 outfile.write(json_object) 769 770 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The `file` parameter is a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        The `print_stats` function generates a markdown file and prints the statistics contained in a
        JSON file in a formatted manner.

        :param output_file: The `output_file` parameter is a string that specifies the path and filename
        of the output file where the stats will be printed in Markdown format. If no `output_file` is
        provided, a temporary directory will be created and the stats will be saved in a file named
        "stats.md" within that
        :type output_file: str
        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
        file where the statistics will be saved. If no value is provided, a temporary directory will be
        created and a default file name "stats.json" will be used
        :type json_file: str
        :return: The function `print_stats` does not return any value. It has a return type annotation
        of `None`.
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files: default both outputs into the temporary directory, which
            # is removed when the with-block exits
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file (JSON is a subset of YAML, so safe_load parses
            # the JSON stats file)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections: each top-level key becomes a "##" section;
            # dict-shaped values are rendered as markdown tables, everything
            # else as a bullet line
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to coerce the value into a DataFrame, first
                        # directly, then via a JSON-decoded string; if both
                        # fail it is rendered as a plain bullet
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown on stdout (index is intentionally not
            # printed, only written to the file)
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The `output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory is created and the stats are saved in a file named "stats.md" within it.
- json_file: The `json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory is created and a default file name "stats.json" is used.
Returns
The function `print_stats` does not return any value; its return type annotation is `None`.
874 def get_input(self) -> str: 875 """ 876 It returns the value of the input variable. 877 :return: The input is being returned. 878 """ 879 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
881 def get_input_format(self, input_file: str = None) -> str: 882 """ 883 This function returns the format of the input variable, either from the provided input file or 884 by prompting for input. 885 886 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 887 represents the file path of the input file. If no `input_file` is provided when calling the 888 method, it will default to `None` 889 :type input_file: str 890 :return: The format of the input variable is being returned. 891 """ 892 893 if not input_file: 894 input_file = self.get_input() 895 input_format = get_file_format(input_file) 896 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The `input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it defaults to `None` and the current input file is used.
Returns
The format of the input variable is being returned.
898 def get_input_compressed(self, input_file: str = None) -> str: 899 """ 900 The function `get_input_compressed` returns the format of the input variable after compressing 901 it. 902 903 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 904 that represents the file path of the input file. If no `input_file` is provided when calling the 905 method, it will default to `None` and the method will then call `self.get_input()` to 906 :type input_file: str 907 :return: The function `get_input_compressed` returns the compressed format of the input 908 variable. 909 """ 910 911 if not input_file: 912 input_file = self.get_input() 913 input_compressed = get_file_compressed(input_file) 914 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The
input_fileparameter in theget_input_compressedmethod is a string that represents the file path of the input file. If noinput_fileis provided when calling the method, it will default toNoneand the method will then callself.get_input()to
Returns
The function
get_input_compressedreturns the compressed format of the input variable.
916 def get_output(self) -> str: 917 """ 918 It returns the output of the neuron. 919 :return: The output of the neural network. 920 """ 921 922 return self.output
It returns the output of the neuron.
Returns
The output of the neural network.
924 def get_output_format(self, output_file: str = None) -> str: 925 """ 926 The function `get_output_format` returns the format of the input variable or the output file if 927 provided. 928 929 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 930 that represents the file path of the output file. If no `output_file` is provided when calling 931 the method, it will default to the output obtained from the `get_output` method of the class 932 instance. The 933 :type output_file: str 934 :return: The format of the input variable is being returned. 935 """ 936 937 if not output_file: 938 output_file = self.get_output() 939 output_format = get_file_format(output_file) 940 941 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The
output_fileparameter in theget_output_formatmethod is a string that represents the file path of the output file. If nooutput_fileis provided when calling the method, it will default to the output obtained from theget_outputmethod of the class instance. The
Returns
The format of the input variable is being returned.
943 def get_config(self) -> dict: 944 """ 945 It returns the config 946 :return: The config variable is being returned. 947 """ 948 return self.config
It returns the config
Returns
The config variable is being returned.
950 def get_param(self) -> dict: 951 """ 952 It returns the param 953 :return: The param variable is being returned. 954 """ 955 return self.param
It returns the param
Returns
The param variable is being returned.
957 def get_connexion_db(self) -> str: 958 """ 959 It returns the connexion_db attribute of the object 960 :return: The connexion_db is being returned. 961 """ 962 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
964 def get_prefix(self) -> str: 965 """ 966 It returns the prefix of the object. 967 :return: The prefix is being returned. 968 """ 969 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
971 def get_table_variants(self, clause: str = "select") -> str: 972 """ 973 This function returns the table_variants attribute of the object 974 975 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 976 defaults to select (optional) 977 :return: The table_variants attribute of the object. 978 """ 979 980 # Access 981 access = self.get_config().get("access", None) 982 983 # Clauses "select", "where", "update" 984 if clause in ["select", "where", "update"]: 985 table_variants = self.table_variants 986 # Clause "from" 987 elif clause in ["from"]: 988 # For Read Only 989 if self.get_input_format() in ["parquet"] and access in ["RO"]: 990 input_file = self.get_input() 991 table_variants = f"'{input_file}' as variants" 992 # For Read Write 993 else: 994 table_variants = f"{self.table_variants} as variants" 995 else: 996 table_variants = self.table_variants 997 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
999 def get_tmp_dir(self) -> str: 1000 """ 1001 The function `get_tmp_dir` returns the temporary directory path based on configuration 1002 parameters or a default path. 1003 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1004 configuration, parameters, and a default value of "/tmp". 1005 """ 1006 1007 return get_tmp( 1008 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1009 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The `get_tmp_dir` method returns the temporary directory path based on the configuration, the parameters, and a default value of "/tmp".
1011 def get_connexion_type(self) -> str: 1012 """ 1013 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1014 1015 :return: The connexion type is being returned. 1016 """ 1017 return self.get_config().get("connexion_type", "memory")
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
1019 def get_connexion(self): 1020 """ 1021 It returns the connection object 1022 1023 :return: The connection object. 1024 """ 1025 return self.conn
It returns the connection object
Returns
The connection object.
1027 def close_connexion(self) -> None: 1028 """ 1029 This function closes the connection to the database. 1030 :return: The connection is being closed. 1031 """ 1032 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1034 def get_header(self, type: str = "vcf"): 1035 """ 1036 This function returns the header of the VCF file as a list of strings 1037 1038 :param type: the type of header you want to get, defaults to vcf (optional) 1039 :return: The header of the vcf file. 1040 """ 1041 1042 if self.header_vcf: 1043 if type == "vcf": 1044 return self.header_vcf 1045 elif type == "list": 1046 return self.header_list 1047 else: 1048 if type == "vcf": 1049 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1050 return header 1051 elif type == "list": 1052 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1054 def get_header_length(self, file: str = None) -> int: 1055 """ 1056 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1057 line. 1058 1059 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1060 header file. If this argument is provided, the function will read the header from the specified 1061 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1062 :type file: str 1063 :return: the length of the header list, excluding the #CHROM line. 1064 """ 1065 1066 if file: 1067 return len(self.read_vcf_header_file(file=file)) - 1 1068 elif self.get_header(type="list"): 1069 return len(self.get_header(type="list")) - 1 1070 else: 1071 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The
fileparameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns
the length of the header list, excluding the #CHROM line.
1073 def get_header_columns(self) -> str: 1074 """ 1075 This function returns the header list of a VCF 1076 1077 :return: The length of the header list. 1078 """ 1079 if self.get_header(): 1080 return self.get_header(type="list")[-1] 1081 else: 1082 return ""
This function returns the header list of a VCF
Returns
The length of the header list.
1084 def get_header_columns_as_list(self) -> list: 1085 """ 1086 This function returns the header list of a VCF 1087 1088 :return: The length of the header list. 1089 """ 1090 if self.get_header(): 1091 return self.get_header_columns().strip().split("\t") 1092 else: 1093 return []
This function returns the header list of a VCF
Returns
The length of the header list.
1095 def get_header_columns_as_sql(self) -> str: 1096 """ 1097 This function retruns header length (without #CHROM line) 1098 1099 :return: The length of the header list. 1100 """ 1101 sql_column_list = [] 1102 for col in self.get_header_columns_as_list(): 1103 sql_column_list.append(f'"{col}"') 1104 return ",".join(sql_column_list)
This function retruns header length (without #CHROM line)
Returns
The length of the header list.
1106 def get_header_sample_list(self) -> list: 1107 """ 1108 This function retruns header length (without #CHROM line) 1109 1110 :return: The length of the header list. 1111 """ 1112 return self.header_vcf.samples
This function retruns header length (without #CHROM line)
Returns
The length of the header list.
1114 def get_verbose(self) -> bool: 1115 """ 1116 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1117 exist 1118 1119 :return: The value of the key "verbose" in the config dictionary. 1120 """ 1121 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1123 def get_connexion_format(self) -> str: 1124 """ 1125 It returns the connexion format of the object. 1126 :return: The connexion_format is being returned. 1127 """ 1128 connexion_format = self.connexion_format 1129 if connexion_format not in ["duckdb", "sqlite"]: 1130 log.error(f"Unknown connexion format {connexion_format}") 1131 raise ValueError(f"Unknown connexion format {connexion_format}") 1132 else: 1133 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read `file` in chunks and insert each chunk into the "variants" table.

        :param file: path (or file-like object) of the delimited file to load
        :param columns: comma-separated column names used in the INSERT statement
        :type columns: str
        :param header_len: number of leading lines to skip before the data
            (typically the VCF header length), defaults to 0
        :type header_len: int (optional)
        :param sep: field separator of the file, defaults to "\\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
            the "load.chunk" config key, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the "load.chunk" config key overrides the chunksize argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if chunksize resolves to a falsy value (0/None), the
        # file is silently not loaded at all — confirm this is intended
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # presumably relies on DuckDB's pandas replacement scan to
                    # resolve "chunk" to the local DataFrame of the same name
                    # — TODO confirm
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The `file` parameter is the file to load into a table. It should be the path to the file on your system.
- columns: The `columns` parameter is a string containing the names of the columns of the table where the data will be inserted, separated by commas.
- header_len: The `header_len` parameter specifies the number of lines to skip at the beginning of the file before reading the actual data, defaults to 0.
- sep: The `sep` parameter specifies the separator character used in the file being read; the default is a tab character.
- chunksize: The `chunksize` parameter specifies the number of rows to read at a time when processing the file in chunks, defaults to 1000000.
1189 def load_data( 1190 self, 1191 input_file: str = None, 1192 drop_variants_table: bool = False, 1193 sample_size: int = 20480, 1194 ) -> None: 1195 """ 1196 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1197 table before loading the data and specify a sample size. 1198 1199 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1200 table 1201 :type input_file: str 1202 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1203 determines whether the variants table should be dropped before loading the data. If set to 1204 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1205 not be dropped, defaults to False 1206 :type drop_variants_table: bool (optional) 1207 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1208 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1209 20480 1210 :type sample_size: int (optional) 1211 """ 1212 1213 log.info("Loading...") 1214 1215 # change input file 1216 if input_file: 1217 self.set_input(input_file) 1218 self.set_header() 1219 1220 # drop variants table 1221 if drop_variants_table: 1222 self.drop_variants_table() 1223 1224 # get table variants 1225 table_variants = self.get_table_variants() 1226 1227 # Access 1228 access = self.get_config().get("access", None) 1229 log.debug(f"access: {access}") 1230 1231 # Input format and compress 1232 input_format = self.get_input_format() 1233 input_compressed = self.get_input_compressed() 1234 log.debug(f"input_format: {input_format}") 1235 log.debug(f"input_compressed: {input_compressed}") 1236 1237 # input_compressed_format 1238 if input_compressed: 1239 input_compressed_format = "gzip" 1240 else: 1241 input_compressed_format = "none" 1242 log.debug(f"input_compressed_format: {input_compressed_format}") 1243 1244 # Connexion 
format 1245 connexion_format = self.get_connexion_format() 1246 1247 # Sample size 1248 if not sample_size: 1249 sample_size = -1 1250 log.debug(f"sample_size: {sample_size}") 1251 1252 # Load data 1253 log.debug(f"Load Data from {input_format}") 1254 1255 # DuckDB connexion 1256 if connexion_format in ["duckdb"]: 1257 1258 # Database already exists 1259 if self.input_format in ["db", "duckdb"]: 1260 1261 if connexion_format in ["duckdb"]: 1262 log.debug(f"Input file format '{self.input_format}' duckDB") 1263 else: 1264 log.error( 1265 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1266 ) 1267 raise ValueError( 1268 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1269 ) 1270 1271 # Load from existing database format 1272 else: 1273 1274 try: 1275 # Create Table or View 1276 database = Database(database=self.input) 1277 sql_from = database.get_sql_from(sample_size=sample_size) 1278 1279 if access in ["RO"]: 1280 sql_load = ( 1281 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1282 ) 1283 else: 1284 sql_load = ( 1285 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1286 ) 1287 self.conn.execute(sql_load) 1288 1289 except: 1290 # Format not available 1291 log.error(f"Input file format '{self.input_format}' not available") 1292 raise ValueError( 1293 f"Input file format '{self.input_format}' not available" 1294 ) 1295 1296 # SQLite connexion 1297 elif connexion_format in ["sqlite"] and input_format in [ 1298 "vcf", 1299 "tsv", 1300 "csv", 1301 "psv", 1302 ]: 1303 1304 # Main structure 1305 structure = { 1306 "#CHROM": "VARCHAR", 1307 "POS": "INTEGER", 1308 "ID": "VARCHAR", 1309 "REF": "VARCHAR", 1310 "ALT": "VARCHAR", 1311 "QUAL": "VARCHAR", 1312 "FILTER": "VARCHAR", 1313 "INFO": "VARCHAR", 1314 } 1315 1316 # Strcuture with samples 1317 structure_complete = structure 1318 if self.get_header_sample_list(): 1319 structure["FORMAT"] = "VARCHAR" 
1320 for sample in self.get_header_sample_list(): 1321 structure_complete[sample] = "VARCHAR" 1322 1323 # Columns list for create and insert 1324 sql_create_table_columns = [] 1325 sql_create_table_columns_list = [] 1326 for column in structure_complete: 1327 column_type = structure_complete[column] 1328 sql_create_table_columns.append( 1329 f'"{column}" {column_type} default NULL' 1330 ) 1331 sql_create_table_columns_list.append(f'"{column}"') 1332 1333 # Create database 1334 log.debug(f"Create Table {table_variants}") 1335 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1336 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1337 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1338 self.conn.execute(sql_create_table) 1339 1340 # chunksize define length of file chunk load file 1341 chunksize = 100000 1342 1343 # delimiter 1344 delimiter = file_format_delimiters.get(input_format, "\t") 1345 1346 # Load the input file 1347 with open(self.input, "rt") as input_file: 1348 1349 # Use the appropriate file handler based on the input format 1350 if input_compressed: 1351 input_file = bgzf.open(self.input, "rt") 1352 if input_format in ["vcf"]: 1353 header_len = self.get_header_length() 1354 else: 1355 header_len = 0 1356 1357 # Insert the file contents into a table 1358 self.insert_file_to_table( 1359 input_file, 1360 columns=sql_create_table_columns_list_sql, 1361 header_len=header_len, 1362 sep=delimiter, 1363 chunksize=chunksize, 1364 ) 1365 1366 else: 1367 log.error( 1368 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1369 ) 1370 raise ValueError( 1371 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1372 ) 1373 1374 # Explode INFOS fields into table fields 1375 if self.get_explode_infos(): 1376 self.explode_infos( 1377 prefix=self.get_explode_infos_prefix(), 1378 fields=self.get_explode_infos_fields(), 
1379 force=True, 1380 ) 1381 1382 # Create index after insertion 1383 self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The `drop_variants_table` parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to `True`, the variants table will be dropped. If set to `False` (default), the variants table will not be dropped, defaults to False
- sample_size: The `sample_size` parameter determines the number of rows to be sampled from the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 20480
1385 def get_explode_infos(self) -> bool: 1386 """ 1387 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1388 to False if it is not set. 1389 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1390 value. If the parameter is not present, it will return False. 1391 """ 1392 1393 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
1395 def get_explode_infos_fields( 1396 self, 1397 explode_infos_fields: str = None, 1398 remove_fields_not_in_header: bool = False, 1399 ) -> list: 1400 """ 1401 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1402 the input parameter `explode_infos_fields`. 1403 1404 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1405 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1406 comma-separated list of field names to explode 1407 :type explode_infos_fields: str 1408 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1409 flag that determines whether to remove fields that are not present in the header. If it is set 1410 to `True`, any field that is not in the header will be excluded from the list of exploded 1411 information fields. If it is set to `, defaults to False 1412 :type remove_fields_not_in_header: bool (optional) 1413 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1414 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1415 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1416 Otherwise, it returns a list of exploded information fields after removing any spaces and 1417 splitting the string by commas. 
1418 """ 1419 1420 # If no fields, get it in param 1421 if not explode_infos_fields: 1422 explode_infos_fields = ( 1423 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1424 ) 1425 1426 # If no fields, defined as all fields in header using keyword 1427 if not explode_infos_fields: 1428 explode_infos_fields = "*" 1429 1430 # If fields list not empty 1431 if explode_infos_fields: 1432 1433 # Input fields list 1434 if isinstance(explode_infos_fields, str): 1435 fields_input = explode_infos_fields.split(",") 1436 elif isinstance(explode_infos_fields, list): 1437 fields_input = explode_infos_fields 1438 else: 1439 fields_input = [] 1440 1441 # Fields list without * keyword 1442 fields_without_all = fields_input.copy() 1443 if "*".casefold() in (item.casefold() for item in fields_without_all): 1444 fields_without_all.remove("*") 1445 1446 # Fields in header 1447 fields_in_header = sorted(list(set(self.get_header().infos))) 1448 1449 # Construct list of fields 1450 fields_output = [] 1451 for field in fields_input: 1452 1453 # Strip field 1454 field = field.strip() 1455 1456 # format keyword * in regex 1457 if field.upper() in ["*"]: 1458 field = ".*" 1459 1460 # Find all fields with pattern 1461 r = re.compile(field) 1462 fields_search = sorted(list(filter(r.match, fields_in_header))) 1463 1464 # Remove fields input from search 1465 if fields_search != [field]: 1466 fields_search = sorted( 1467 list(set(fields_search).difference(fields_input)) 1468 ) 1469 1470 # If field is not in header (avoid not well formatted header) 1471 if not fields_search and not remove_fields_not_in_header: 1472 fields_search = [field] 1473 1474 # Add found fields 1475 for new_field in fields_search: 1476 # Add field, if not already exists, and if it is in header (if asked) 1477 if ( 1478 new_field not in fields_output 1479 and ( 1480 not remove_fields_not_in_header 1481 or new_field in fields_in_header 1482 ) 1483 and new_field not in [".*"] 1484 ): 1485 
fields_output.append(new_field) 1486 1487 return fields_output 1488 1489 else: 1490 1491 return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the fields to be exploded. It can be set to `"*"` to explode all fields, or it can be a comma-separated list of field names (or regex patterns) to explode
- remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to `True`, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to `False`, such fields are kept, defaults to False
Returns
The function
`get_explode_infos_fields` returns a list of exploded information fields. If the `explode_infos_fields` parameter is not provided, the value from the parameters is used; when none is defined, the `"*"` keyword expands to all header fields. Otherwise, it returns a list of exploded information fields after stripping spaces and splitting the string by commas.
1493 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1494 """ 1495 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1496 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1497 not provided. 1498 1499 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1500 prefix to be used for exploding or expanding information 1501 :type explode_infos_prefix: str 1502 :return: the value of the variable `explode_infos_prefix`. 1503 """ 1504 1505 if not explode_infos_prefix: 1506 explode_infos_prefix = ( 1507 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1508 ) 1509 1510 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- explode_infos_prefix: The parameter
`explode_infos_prefix` is a string that specifies a prefix to be used for exploded INFO columns
Returns
the value of the variable
`explode_infos_prefix`.
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a SQLite or DuckDB table if it does not already exist.

        :param table_name: name of the table to alter
        :param column_name: name of the column to add
        :param column_type: SQL data type of the new column (e.g. "INTEGER",
            "VARCHAR")
        :param default_value: optional default value for the new column; it is
            interpolated as-is into the ALTER statement (callers pass SQL such
            as "null")
        :param drop: when True and the column already exists, drop it first
            and recreate it; when False, an existing column is left untouched,
            defaults to False
        :return: a dict describing the added column ("table_name",
            "column_name", "column_type", "default_value") when a brand-new
            column was created, or None when the column already existed —
            including when it was dropped and recreated with drop=True
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (zero-row SELECT just to retrieve the column names)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # NOTE: "added" stays False when the column was dropped and recreated,
        # so recreated columns are deliberately not reported as newly added
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The
`column_type` parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
- default_value: The `default_value` parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
- drop: The `drop` parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If `drop` is set to `True`, the function will drop the existing column before adding the new column. If `drop` is set to `False` (default), an existing column is left unchanged, defaults to False
Returns
a dictionary describing the added column, or None when the column already existed (including when it was dropped and recreated).
1584 def drop_column( 1585 self, column: dict = None, table_name: str = None, column_name: str = None 1586 ) -> bool: 1587 """ 1588 The `drop_column` function drops a specified column from a given table in a database and returns 1589 True if the column was successfully dropped, and False if the column does not exist in the 1590 table. 1591 1592 :param column: The `column` parameter is a dictionary that contains information about the column 1593 you want to drop. It has two keys: 1594 :type column: dict 1595 :param table_name: The `table_name` parameter is the name of the table from which you want to 1596 drop a column 1597 :type table_name: str 1598 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1599 from the table 1600 :type column_name: str 1601 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1602 and False if the column does not exist in the table. 1603 """ 1604 1605 # Find column infos 1606 if column: 1607 if isinstance(column, dict): 1608 table_name = column.get("table_name", None) 1609 column_name = column.get("column_name", None) 1610 elif isinstance(column, str): 1611 table_name = self.get_table_variants() 1612 column_name = column 1613 else: 1614 table_name = None 1615 column_name = None 1616 1617 if not table_name and not column_name: 1618 return False 1619 1620 # Removed 1621 removed = False 1622 1623 # Check if the column already exists in the table 1624 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1625 columns = self.get_query_to_df(query).columns.tolist() 1626 if column_name in columns: 1627 log.debug(f"The {column_name} column exists in the {table_name} table") 1628 else: 1629 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1630 return False 1631 1632 # Add column in table # ALTER TABLE integers DROP k 1633 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1634 
self.execute_query(add_column_query) 1635 removed = True 1636 log.debug( 1637 f"The {column_name} column was successfully dropped to the {table_name} table" 1638 ) 1639 1640 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The
`column` parameter is a dictionary that contains information about the column you want to drop. It has two keys: "table_name" and "column_name"
- table_name: The `table_name` parameter is the name of the table from which you want to drop a column
- column_name: The `column_name` parameter is the name of the column that you want to drop from the table
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns and fill them
        from the INFO column, returning the list of added columns.

        :param prefix: prefix for the exploded columns; when None/True or not
            a string, falls back to `get_explode_infos_prefix()`, then "INFO/"
        :param create_index: when True, recreate indexes after exploding,
            defaults to False
        :param fields: list of INFO fields (names or patterns) to explode;
            when empty, all fields resolved by `get_explode_infos_fields()`
            are used
        :param force: when True, existing columns are dropped and recreated
            (passed as `drop` to `add_column`), defaults to False
        :param proccess_all_fields_together: when True, a single UPDATE sets
            all exploded columns at once; when False, one UPDATE per field,
            defaults to False
        :return: list of column descriptions added by `add_column` (recreated
            columns are not included, since `add_column` returns None for them)
        """

        # Drop indexes before the mass UPDATEs
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access mode: read-only databases are left untouched
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix (None/True or non-string falls back to param, then "INFO/")
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except — any failure silently yields []
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: header fields plus the requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/number from the header; unknown fields default to
                    # a single String value
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "FIELD=value" from INFO
                        # (engine-specific SQL)
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Update chromosome by chromosome to keep UPDATEs smaller
                # NOTE(review): bare except — any failure falls back to one
                # global update ([None])
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only useful with several chromosomes)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
The explode_infos function takes a VCF file and explodes the INFO fields into individual
columns, returning a list of added columns.
Parameters
- prefix: The
`prefix` parameter is a string that is used as a prefix for the exploded INFO fields. If the `prefix` is not provided or is set to `None`, the function will use the value of `self.get_explode_infos_prefix()` as the prefix
- create_index: The `create_index` parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to `False`, indexes will not be created. The default value is `False`, defaults to False
- fields: The `fields` parameter is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded
- force: The `force` parameter is a boolean flag that determines whether to drop and recreate the column if it already exists in the table. If `force` is set to `True`, the column will be dropped and recreated. If `force` is set to `False`, the column will not be dropped, defaults to False
- proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will be processed individually, defaults to False
Returns
The function
`explode_infos` returns a list of added columns.
1848 def create_indexes(self) -> None: 1849 """ 1850 Create indexes on the table after insertion 1851 """ 1852 1853 # Access 1854 access = self.get_config().get("access", None) 1855 1856 # get table variants 1857 table_variants = self.get_table_variants("FROM") 1858 1859 if self.get_indexing() and access not in ["RO"]: 1860 # Create index 1861 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 1862 self.conn.execute(sql_create_table_index) 1863 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 1864 self.conn.execute(sql_create_table_index) 1865 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 1866 self.conn.execute(sql_create_table_index) 1867 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 1868 self.conn.execute(sql_create_table_index) 1869 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 1870 self.conn.execute(sql_create_table_index) 1871 for field in self.index_additionnal_fields: 1872 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 1873 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
1875 def drop_indexes(self) -> None: 1876 """ 1877 Create indexes on the table after insertion 1878 """ 1879 1880 # Access 1881 access = self.get_config().get("access", None) 1882 1883 # get table variants 1884 table_variants = self.get_table_variants("FROM") 1885 1886 # Get database format 1887 connexion_format = self.get_connexion_format() 1888 1889 if access not in ["RO"]: 1890 if connexion_format in ["duckdb"]: 1891 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 1892 elif connexion_format in ["sqlite"]: 1893 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 1894 1895 list_indexes = self.conn.execute(sql_list_indexes) 1896 index_names = [row[0] for row in list_indexes.fetchall()] 1897 for index in index_names: 1898 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 1899 self.conn.execute(sql_drop_table_index)
Drop the indexes on the variants table.
1901 def read_vcf_header(self, f) -> list: 1902 """ 1903 It reads the header of a VCF file and returns a list of the header lines 1904 1905 :param f: the file object 1906 :return: The header lines of the VCF file. 1907 """ 1908 1909 header_list = [] 1910 for line in f: 1911 header_list.append(line) 1912 if line.startswith("#CHROM"): 1913 break 1914 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
1916 def read_vcf_header_file(self, file: str = None) -> list: 1917 """ 1918 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 1919 uncompressed files. 1920 1921 :param file: The `file` parameter is a string that represents the path to the VCF header file 1922 that you want to read. It is an optional parameter, so if you don't provide a value, it will 1923 default to `None` 1924 :type file: str 1925 :return: The function `read_vcf_header_file` returns a list. 1926 """ 1927 1928 if self.get_input_compressed(input_file=file): 1929 with bgzf.open(file, "rt") as f: 1930 return self.read_vcf_header(f=f) 1931 else: 1932 with open(file, "rt") as f: 1933 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: The
`file` parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to `None`
Returns
The function
`read_vcf_header_file` returns a list of header lines.
1935 def execute_query(self, query: str): 1936 """ 1937 It takes a query as an argument, executes it, and returns the results 1938 1939 :param query: The query to be executed 1940 :return: The result of the query is being returned. 1941 """ 1942 if query: 1943 return self.conn.execute(query) # .fetchall() 1944 else: 1945 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
1947 def export_output( 1948 self, 1949 output_file: str | None = None, 1950 output_header: str | None = None, 1951 export_header: bool = True, 1952 query: str | None = None, 1953 parquet_partitions: list | None = None, 1954 chunk_size: int | None = None, 1955 threads: int | None = None, 1956 sort: bool = False, 1957 index: bool = False, 1958 order_by: str | None = None, 1959 ) -> bool: 1960 """ 1961 The `export_output` function exports data from a VCF file to a specified output file in various 1962 formats, including VCF, CSV, TSV, PSV, and Parquet. 1963 1964 :param output_file: The `output_file` parameter is a string that specifies the name of the 1965 output file to be generated by the function. This is where the exported data will be saved 1966 :type output_file: str 1967 :param output_header: The `output_header` parameter is a string that specifies the name of the 1968 file where the header of the VCF file will be exported. If this parameter is not provided, the 1969 header will be exported to a file with the same name as the `output_file` parameter, but with 1970 the extension " 1971 :type output_header: str 1972 :param export_header: The `export_header` parameter is a boolean flag that determines whether 1973 the header of a VCF file should be exported to a separate file or not. If `export_header` is 1974 True, the header will be exported to a file. If `export_header` is False, the header will not 1975 be, defaults to True, if output format is not VCF 1976 :type export_header: bool (optional) 1977 :param query: The `query` parameter is an optional SQL query that can be used to filter and 1978 select specific data from the VCF file before exporting it. If provided, only the data that 1979 matches the query will be exported 1980 :type query: str 1981 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 1982 columns to be used for partitioning the Parquet file during export. 
Partitioning is a way to 1983 organize data in a hierarchical directory structure based on the values of one or more columns. 1984 This can improve query performance when working with large datasets 1985 :type parquet_partitions: list 1986 :param chunk_size: The `chunk_size` parameter specifies the number of 1987 records in batch when exporting data in Parquet format. This parameter is used for 1988 partitioning the Parquet file into multiple files. 1989 :type chunk_size: int 1990 :param threads: The `threads` parameter is an optional parameter that specifies the number of 1991 threads to be used during the export process. It determines the level of parallelism and can 1992 improve the performance of the export operation. If not provided, the function will use the 1993 default number of threads 1994 :type threads: int 1995 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 1996 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 1997 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 1998 False 1999 :type sort: bool (optional) 2000 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2001 created on the output file. If `index` is True, an index will be created. If `index` is False, 2002 no index will be created. The default value is False, defaults to False 2003 :type index: bool (optional) 2004 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2005 sorting the output file. This parameter is only applicable when exporting data in VCF format 2006 :type order_by: str 2007 :return: a boolean value. It checks if the output file exists and returns True if it does, or 2008 None if it doesn't. 
2009 """ 2010 2011 # Log 2012 log.info("Exporting...") 2013 2014 # Full path 2015 output_file = full_path(output_file) 2016 output_header = full_path(output_header) 2017 2018 # Config 2019 config = self.get_config() 2020 2021 # Param 2022 param = self.get_param() 2023 2024 # Tmp files to remove 2025 tmp_to_remove = [] 2026 2027 # If no output, get it 2028 if not output_file: 2029 output_file = self.get_output() 2030 2031 # If not threads 2032 if not threads: 2033 threads = self.get_threads() 2034 2035 # Auto header name with extension 2036 if export_header or output_header: 2037 if not output_header: 2038 output_header = f"{output_file}.hdr" 2039 # Export header 2040 self.export_header(output_file=output_file) 2041 2042 # Switch off export header if VCF output 2043 output_file_type = get_file_format(output_file) 2044 if output_file_type in ["vcf"]: 2045 export_header = False 2046 tmp_to_remove.append(output_header) 2047 2048 # Chunk size 2049 if not chunk_size: 2050 chunk_size = config.get("chunk_size", None) 2051 2052 # Parquet partition 2053 if not parquet_partitions: 2054 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2055 if parquet_partitions and isinstance(parquet_partitions, str): 2056 parquet_partitions = parquet_partitions.split(",") 2057 2058 # Order by 2059 if not order_by: 2060 order_by = param.get("export", {}).get("order_by", "") 2061 2062 # Header in output 2063 header_in_output = param.get("export", {}).get("include_header", False) 2064 2065 # Database 2066 database_source = self.get_connexion() 2067 2068 # Connexion format 2069 connexion_format = self.get_connexion_format() 2070 2071 # Explode infos 2072 if self.get_explode_infos(): 2073 self.explode_infos( 2074 prefix=self.get_explode_infos_prefix(), 2075 fields=self.get_explode_infos_fields(), 2076 force=False, 2077 ) 2078 2079 # if connexion_format in ["sqlite"] or query: 2080 if connexion_format in ["sqlite"]: 2081 2082 # Export in Parquet 2083 random_tmp = 
"".join( 2084 random.choice(string.ascii_lowercase) for i in range(10) 2085 ) 2086 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2087 tmp_to_remove.append(database_source) 2088 2089 # Table Variants 2090 table_variants = self.get_table_variants() 2091 2092 # Create export query 2093 sql_query_export_subquery = f""" 2094 SELECT * FROM {table_variants} 2095 """ 2096 2097 # Write source file 2098 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2099 2100 # Create database 2101 database = Database( 2102 database=database_source, 2103 table="variants", 2104 header_file=output_header, 2105 conn_config=self.get_connexion_config(), 2106 ) 2107 2108 # Existing colomns header 2109 # existing_columns_header = database.get_header_file_columns(output_header) 2110 existing_columns_header = database.get_header_columns_from_database() 2111 2112 # Export file 2113 database.export( 2114 output_database=output_file, 2115 output_header=output_header, 2116 existing_columns_header=existing_columns_header, 2117 parquet_partitions=parquet_partitions, 2118 chunk_size=chunk_size, 2119 threads=threads, 2120 sort=sort, 2121 index=index, 2122 header_in_output=header_in_output, 2123 order_by=order_by, 2124 query=query, 2125 export_header=export_header, 2126 ) 2127 2128 # Remove 2129 remove_if_exists(tmp_to_remove) 2130 2131 return (os.path.exists(output_file) or None) and ( 2132 os.path.exists(output_file) or None 2133 )
The export_output function exports data from a VCF file to a specified output file in various
formats, including VCF, CSV, TSV, PSV, and Parquet.
Parameters
- output_file: The
`output_file` parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved. - output_header: The
`output_header` parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the `output_file` parameter, but with the extension ".hdr". - export_header: The
export_headerparameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. Ifexport_headeris True, the header will be exported to a file. Ifexport_headeris False, the header will not be, defaults to True, if output format is not VCF - query: The
queryparameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported - parquet_partitions: The
parquet_partitionsparameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets - chunk_size: The
chunk_sizeparameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files. - threads: The
threadsparameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads - sort: The
sortparameter is a boolean flag that determines whether the output file should be sorted or not. Ifsortis set toTrue, the output file will be sorted based on the genomic coordinates of the variants. By default, the value ofsortisFalse, defaults to False - index: The
indexparameter is a boolean flag that determines whether an index should be created on the output file. Ifindexis True, an index will be created. Ifindexis False, no index will be created. The default value is False, defaults to False - order_by: The
order_byparameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns
a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.
2135 def get_extra_infos(self, table: str = None) -> list: 2136 """ 2137 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2138 in the header. 2139 2140 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2141 name of the table from which you want to retrieve the extra columns that are not present in the 2142 header. If the `table` parameter is not provided when calling the function, it will default to 2143 using the variants 2144 :type table: str 2145 :return: A list of columns that are in the specified table but not in the header of the table. 2146 """ 2147 2148 header_columns = [] 2149 2150 if not table: 2151 table = self.get_table_variants(clause="from") 2152 header_columns = self.get_header_columns() 2153 2154 # Check all columns in the database 2155 query = f""" SELECT * FROM {table} LIMIT 1 """ 2156 log.debug(f"query {query}") 2157 table_columns = self.get_query_to_df(query).columns.tolist() 2158 extra_columns = [] 2159 2160 # Construct extra infos (not in header) 2161 for column in table_columns: 2162 if column not in header_columns: 2163 extra_columns.append(column) 2164 2165 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: The
`table` parameter in the `get_extra_infos` function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the `table` parameter is not provided when calling the function, it will default to using the variants table.
Returns
A list of columns that are in the specified table but not in the header of the table.
2167 def get_extra_infos_sql(self, table: str = None) -> str: 2168 """ 2169 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2170 by double quotes 2171 2172 :param table: The name of the table to get the extra infos from. If None, the default table is 2173 used 2174 :type table: str 2175 :return: A string of the extra infos 2176 """ 2177 2178 return ", ".join( 2179 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2180 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
2182 def export_header( 2183 self, 2184 header_name: str = None, 2185 output_file: str = None, 2186 output_file_ext: str = ".hdr", 2187 clean_header: bool = True, 2188 remove_chrom_line: bool = False, 2189 ) -> str: 2190 """ 2191 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2192 specified options, and writes it to a new file. 2193 2194 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2195 this parameter is not specified, the header will be written to the output file 2196 :type header_name: str 2197 :param output_file: The `output_file` parameter in the `export_header` function is used to 2198 specify the name of the output file where the header will be written. If this parameter is not 2199 provided, the header will be written to a temporary file 2200 :type output_file: str 2201 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2202 string that represents the extension of the output header file. By default, it is set to ".hdr" 2203 if not specified by the user. This extension will be appended to the `output_file` name to 2204 create the final, defaults to .hdr 2205 :type output_file_ext: str (optional) 2206 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2207 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2208 `True`, the function will clean the header by modifying certain lines based on a specific 2209 pattern. If `clean_header`, defaults to True 2210 :type clean_header: bool (optional) 2211 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2212 boolean flag that determines whether the #CHROM line should be removed from the header before 2213 writing it to the output file. 
If set to `True`, the #CHROM line will be removed; if set to `, 2214 defaults to False 2215 :type remove_chrom_line: bool (optional) 2216 :return: The function `export_header` returns the name of the temporary header file that is 2217 created. 2218 """ 2219 2220 if not header_name and not output_file: 2221 output_file = self.get_output() 2222 2223 if self.get_header(): 2224 2225 # Get header object 2226 header_obj = self.get_header() 2227 2228 # Create database 2229 db_for_header = Database(database=self.get_input()) 2230 2231 # Get real columns in the file 2232 db_header_columns = db_for_header.get_columns() 2233 2234 with tempfile.TemporaryDirectory() as tmpdir: 2235 2236 # Write header file 2237 header_file_tmp = os.path.join(tmpdir, "header") 2238 f = open(header_file_tmp, "w") 2239 vcf.Writer(f, header_obj) 2240 f.close() 2241 2242 # Replace #CHROM line with rel columns 2243 header_list = db_for_header.read_header_file( 2244 header_file=header_file_tmp 2245 ) 2246 header_list[-1] = "\t".join(db_header_columns) 2247 2248 # Remove CHROM line 2249 if remove_chrom_line: 2250 header_list.pop() 2251 2252 # Clean header 2253 if clean_header: 2254 header_list_clean = [] 2255 for head in header_list: 2256 # Clean head for malformed header 2257 head_clean = head 2258 head_clean = re.subn( 2259 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2260 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2261 head_clean, 2262 2, 2263 )[0] 2264 # Write header 2265 header_list_clean.append(head_clean) 2266 header_list = header_list_clean 2267 2268 tmp_header_name = output_file + output_file_ext 2269 2270 f = open(tmp_header_name, "w") 2271 for line in header_list: 2272 f.write(line) 2273 f.close() 2274 2275 return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: The
header_nameparameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file - output_file: The
output_fileparameter in theexport_headerfunction is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file - output_file_ext: The
output_file_extparameter in theexport_headerfunction is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to theoutput_filename to create the final, defaults to .hdr - clean_header: The
clean_headerparameter in theexport_headerfunction is a boolean flag that determines whether the header should be cleaned or not. Whenclean_headeris set toTrue, the function will clean the header by modifying certain lines based on a specific pattern. Ifclean_header, defaults to True - remove_chrom_line: The
remove_chrom_lineparameter in theexport_headerfunction is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set toTrue, the #CHROM line will be removed; if set to `, defaults to False
Returns
The function
`export_header` returns the name of the temporary header file that is created.
2277 def export_variant_vcf( 2278 self, 2279 vcf_file, 2280 remove_info: bool = False, 2281 add_samples: bool = True, 2282 list_samples: list = [], 2283 where_clause: str = "", 2284 index: bool = False, 2285 threads: int | None = None, 2286 ) -> bool | None: 2287 """ 2288 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2289 remove INFO field, add samples, and control compression and indexing. 2290 2291 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2292 written to. It is the output file that will contain the filtered VCF data based on the specified 2293 parameters 2294 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2295 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2296 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2297 in, defaults to False 2298 :type remove_info: bool (optional) 2299 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2300 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2301 If set to False, the samples will be removed. The default value is True, defaults to True 2302 :type add_samples: bool (optional) 2303 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2304 in the output VCF file. By default, all samples will be included. If you provide a list of 2305 samples, only those samples will be included in the output file 2306 :type list_samples: list 2307 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2308 determines whether or not to create an index for the output VCF file. If `index` is set to 2309 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2310 :type index: bool (optional) 2311 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2312 number of threads to use for exporting the VCF file. It determines how many parallel threads 2313 will be used during the export process. More threads can potentially speed up the export process 2314 by utilizing multiple cores of the processor. If 2315 :type threads: int | None 2316 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2317 method with various parameters including the output file, query, threads, sort flag, and index 2318 flag. The `export_output` method is responsible for exporting the VCF data based on the 2319 specified parameters and configurations provided in the `export_variant_vcf` function. 2320 """ 2321 2322 # Config 2323 config = self.get_config() 2324 2325 # Extract VCF 2326 log.debug("Export VCF...") 2327 2328 # Table variants 2329 table_variants = self.get_table_variants() 2330 2331 # Threads 2332 if not threads: 2333 threads = self.get_threads() 2334 2335 # Info fields 2336 if remove_info: 2337 if not isinstance(remove_info, str): 2338 remove_info = "." 
2339 info_field = f"""'{remove_info}' as INFO""" 2340 else: 2341 info_field = "INFO" 2342 2343 # Samples fields 2344 if add_samples: 2345 if not list_samples: 2346 list_samples = self.get_header_sample_list() 2347 if list_samples: 2348 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2349 else: 2350 samples_fields = "" 2351 log.debug(f"samples_fields: {samples_fields}") 2352 else: 2353 samples_fields = "" 2354 2355 # Where clause 2356 if where_clause is None: 2357 where_clause = "" 2358 2359 # Variants 2360 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2361 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2362 log.debug(f"sql_query_select={sql_query_select}") 2363 2364 return self.export_output( 2365 output_file=vcf_file, 2366 output_header=None, 2367 export_header=True, 2368 query=sql_query_select, 2369 parquet_partitions=None, 2370 chunk_size=config.get("chunk_size", None), 2371 threads=threads, 2372 sort=True, 2373 index=index, 2374 order_by=None, 2375 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The
vcf_fileparameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters - remove_info: The
remove_infoparameter in theexport_variant_vcffunction is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set toTrue, the INFO field will be removed. If set toFalse, the INFO field will be included in, defaults to False - add_samples: The
add_samplesparameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True - list_samples: The
list_samplesparameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file - index: The
`index` parameter in the `export_variant_vcf` function is a boolean flag that determines whether or not to create an index for the output VCF file. If `index` is set to `True`, the output VCF file will be indexed using tabix; it defaults to False. - threads: The
threadsparameter in theexport_variant_vcffunction specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns
The
`export_variant_vcf` function returns the result of calling the `export_output` method with various parameters including the output file, query, threads, sort flag, and index flag. The `export_output` method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the `export_variant_vcf` function.
2377 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2378 """ 2379 It takes a list of commands and runs them in parallel using the number of threads specified 2380 2381 :param commands: A list of commands to run 2382 :param threads: The number of threads to use, defaults to 1 (optional) 2383 """ 2384 2385 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2387 def get_threads(self, default: int = 1) -> int: 2388 """ 2389 This function returns the number of threads to use for a job, with a default value of 1 if not 2390 specified. 2391 2392 :param default: The `default` parameter in the `get_threads` method is used to specify the 2393 default number of threads to use if no specific value is provided. If no value is provided for 2394 the `threads` parameter in the configuration or input parameters, the `default` value will be 2395 used, defaults to 1 2396 :type default: int (optional) 2397 :return: the number of threads to use for the current job. 2398 """ 2399 2400 # Config 2401 config = self.get_config() 2402 2403 # Param 2404 param = self.get_param() 2405 2406 # Input threads 2407 input_thread = param.get("threads", config.get("threads", None)) 2408 2409 # Check threads 2410 if not input_thread: 2411 threads = default 2412 elif int(input_thread) <= 0: 2413 threads = os.cpu_count() 2414 else: 2415 threads = int(input_thread) 2416 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The
`default` parameter in the `get_threads` method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the `threads` parameter in the configuration or input parameters, the `default` value will be used; it defaults to 1.
Returns
the number of threads to use for the current job.
2418 def get_memory(self, default: str = None) -> str: 2419 """ 2420 This function retrieves the memory value from parameters or configuration with a default value 2421 if not found. 2422 2423 :param default: The `get_memory` function takes in a default value as a string parameter. This 2424 default value is used as a fallback in case the `memory` parameter is not provided in the 2425 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2426 the function 2427 :type default: str 2428 :return: The `get_memory` function returns a string value representing the memory parameter. If 2429 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2430 return the default value provided as an argument to the function. 2431 """ 2432 2433 # Config 2434 config = self.get_config() 2435 2436 # Param 2437 param = self.get_param() 2438 2439 # Input threads 2440 input_memory = param.get("memory", config.get("memory", None)) 2441 2442 # Check threads 2443 if input_memory: 2444 memory = input_memory 2445 else: 2446 memory = default 2447 2448 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The
get_memoryfunction takes in a default value as a string parameter. This default value is used as a fallback in case thememoryparameter is not provided in theparamdictionary or theconfigdictionary. Ifmemoryis not found in either dictionary, the function
Returns
The
`get_memory` function returns a string value representing the memory parameter. If `input_memory` is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
2450 def update_from_vcf(self, vcf_file: str) -> None: 2451 """ 2452 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2453 2454 :param vcf_file: the path to the VCF file 2455 """ 2456 2457 connexion_format = self.get_connexion_format() 2458 2459 if connexion_format in ["duckdb"]: 2460 self.update_from_vcf_duckdb(vcf_file) 2461 elif connexion_format in ["sqlite"]: 2462 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
2464 def update_from_vcf_duckdb(self, vcf_file: str) -> None: 2465 """ 2466 It takes a VCF file and updates the INFO column of the variants table in the database with the 2467 INFO column of the VCF file 2468 2469 :param vcf_file: the path to the VCF file 2470 """ 2471 2472 # varaints table 2473 table_variants = self.get_table_variants() 2474 2475 # Loading VCF into temporaire table 2476 skip = self.get_header_length(file=vcf_file) 2477 vcf_df = pd.read_csv( 2478 vcf_file, 2479 sep="\t", 2480 engine="c", 2481 skiprows=skip, 2482 header=0, 2483 low_memory=False, 2484 ) 2485 sql_query_update = f""" 2486 UPDATE {table_variants} as table_variants 2487 SET INFO = concat( 2488 CASE 2489 WHEN INFO NOT IN ('', '.') 2490 THEN INFO 2491 ELSE '' 2492 END, 2493 ( 2494 SELECT 2495 concat( 2496 CASE 2497 WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.') 2498 THEN ';' 2499 ELSE '' 2500 END 2501 , 2502 CASE 2503 WHEN table_parquet.INFO NOT IN ('','.') 2504 THEN table_parquet.INFO 2505 ELSE '' 2506 END 2507 ) 2508 FROM vcf_df as table_parquet 2509 WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR) 2510 AND table_parquet.\"POS\" = table_variants.\"POS\" 2511 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 2512 AND table_parquet.\"REF\" = table_variants.\"REF\" 2513 AND table_parquet.INFO NOT IN ('','.') 2514 ) 2515 ) 2516 ; 2517 """ 2518 self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
2520 def update_from_vcf_sqlite(self, vcf_file: str) -> None: 2521 """ 2522 It creates a temporary table in the SQLite database, loads the VCF file into the temporary 2523 table, then updates the INFO column of the variants table with the INFO column of the temporary 2524 table 2525 2526 :param vcf_file: The path to the VCF file you want to update the database with 2527 """ 2528 2529 # Create a temporary table for the VCF 2530 table_vcf = "tmp_vcf" 2531 sql_create = ( 2532 f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0" 2533 ) 2534 self.conn.execute(sql_create) 2535 2536 # Loading VCF into temporaire table 2537 vcf_df = pd.read_csv( 2538 vcf_file, sep="\t", comment="#", header=None, low_memory=False 2539 ) 2540 vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"] 2541 vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False) 2542 2543 # Update table 'variants' with VCF data 2544 # warning: CONCAT as || operator 2545 sql_query_update = f""" 2546 UPDATE variants as table_variants 2547 SET INFO = CASE 2548 WHEN INFO NOT IN ('', '.') 2549 THEN INFO 2550 ELSE '' 2551 END || 2552 ( 2553 SELECT 2554 CASE 2555 WHEN table_variants.INFO NOT IN ('','.') 2556 AND table_vcf.INFO NOT IN ('','.') 2557 THEN ';' 2558 ELSE '' 2559 END || 2560 CASE 2561 WHEN table_vcf.INFO NOT IN ('','.') 2562 THEN table_vcf.INFO 2563 ELSE '' 2564 END 2565 FROM {table_vcf} as table_vcf 2566 WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\" 2567 AND table_vcf.\"POS\" = table_variants.\"POS\" 2568 AND table_vcf.\"ALT\" = table_variants.\"ALT\" 2569 AND table_vcf.\"REF\" = table_variants.\"REF\" 2570 ) 2571 """ 2572 self.conn.execute(sql_query_update) 2573 2574 # Drop temporary table 2575 sql_drop = f"DROP TABLE {table_vcf}" 2576 self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2578 def drop_variants_table(self) -> None: 2579 """ 2580 > This function drops the variants table 2581 """ 2582 2583 table_variants = self.get_table_variants() 2584 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2585 self.conn.execute(sql_table_variants)
This function drops the variants table
2587 def set_variant_id( 2588 self, variant_id_column: str = "variant_id", force: bool = None 2589 ) -> str: 2590 """ 2591 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2592 `#CHROM`, `POS`, `REF`, and `ALT` columns 2593 2594 :param variant_id_column: The name of the column to be created in the variants table, defaults 2595 to variant_id 2596 :type variant_id_column: str (optional) 2597 :param force: If True, the variant_id column will be created even if it already exists 2598 :type force: bool 2599 :return: The name of the column that contains the variant_id 2600 """ 2601 2602 # Assembly 2603 assembly = self.get_param().get( 2604 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2605 ) 2606 2607 # INFO/Tag prefix 2608 prefix = self.get_explode_infos_prefix() 2609 2610 # Explode INFO/SVTYPE 2611 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2612 2613 # variants table 2614 table_variants = self.get_table_variants() 2615 2616 # variant_id column 2617 if not variant_id_column: 2618 variant_id_column = "variant_id" 2619 2620 # Creta variant_id column 2621 if "variant_id" not in self.get_extra_infos() or force: 2622 2623 # Create column 2624 self.add_column( 2625 table_name=table_variants, 2626 column_name=variant_id_column, 2627 column_type="UBIGINT", 2628 default_value="0", 2629 ) 2630 2631 # Update column 2632 self.conn.execute( 2633 f""" 2634 UPDATE {table_variants} 2635 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"') 2636 """ 2637 ) 2638 2639 # Remove added columns 2640 for added_column in added_columns: 2641 self.drop_column(column=added_column) 2642 2643 # return variant_id column name 2644 return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2646 def get_variant_id_column( 2647 self, variant_id_column: str = "variant_id", force: bool = None 2648 ) -> str: 2649 """ 2650 This function returns the variant_id column name 2651 2652 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2653 defaults to variant_id 2654 :type variant_id_column: str (optional) 2655 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2656 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2657 if it is not already set, or if it is set 2658 :type force: bool 2659 :return: The variant_id column name. 2660 """ 2661 2662 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2668 def scan_databases( 2669 self, 2670 database_formats: list = ["parquet"], 2671 database_releases: list = ["current"], 2672 ) -> dict: 2673 """ 2674 The function `scan_databases` scans for available databases based on specified formats and 2675 releases. 2676 2677 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2678 of the databases to be scanned. In this case, the accepted format is "parquet" 2679 :type database_formats: list ["parquet"] 2680 :param database_releases: The `database_releases` parameter is a list that specifies the 2681 releases of the databases to be scanned. In the provided function, the default value for 2682 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2683 databases that are in the "current" 2684 :type database_releases: list 2685 :return: The function `scan_databases` returns a dictionary containing information about 2686 databases that match the specified formats and releases. 2687 """ 2688 2689 # Config 2690 config = self.get_config() 2691 2692 # Param 2693 param = self.get_param() 2694 2695 # Param - Assembly 2696 assembly = param.get("assembly", config.get("assembly", None)) 2697 if not assembly: 2698 assembly = DEFAULT_ASSEMBLY 2699 log.warning(f"Default assembly '{assembly}'") 2700 2701 # Scan for availabled databases 2702 log.info( 2703 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2704 ) 2705 databases_infos_dict = databases_infos( 2706 database_folder_releases=database_releases, 2707 database_formats=database_formats, 2708 assembly=assembly, 2709 config=config, 2710 ) 2711 log.info( 2712 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2713 ) 2714 2715 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The `database_formats` parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
- database_releases: The `database_releases` parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for `database_releases` is set to `["current"]`, meaning that by default, the function will scan databases that are in the "current" release
Returns
The function
scan_databasesreturns a dictionary containing information about databases that match the specified formats and releases.
    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.

        Builds a unified comma-separated "annotations" parameter from the quick
        annotation string and the per-tool parameters (annotation_parquet,
        annotation_snpsift, annotation_snpeff, annotation_bcftools,
        annotation_annovar, annotation_exomiser, annotation_splice), resolves each
        annotation source to a database file and an annotation tool, stores the
        result in param["annotation"], then dispatches to the tool-specific
        annotation methods. Finally, explodes INFO fields into table columns
        when requested.

        :return: None; the param dict is updated in place via set_param()
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config, fall back to global default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders: union of the configured annotations,
        # parquet and bcftools database folders (deduplicated via set)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (quick annotation string, comma-separated)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold per-tool shortcut parameters into the unified
        # "tool:db1+db2" quick-annotation syntax
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: normalize the string form into a
            # dict {annotation_file: {"INFO": None}}; dict form is kept as-is
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f" {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL: "ALL[:format=...][:release=...]"
                # expands to every database found by scan_databases
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: everything after "snpeff:" is treated as
                    # snpEff options (overwrites any previously stored options)
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: each ":"-separated token is an annovar
                    # annotation name
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection: explicit "bcftools:"/"snpsift:" prefix
                        # pins the tool; otherwise it is inferred per file below
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ("+" and ":" both separate files)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders (first match wins)
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders (no assembly subfolder)
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # NOTE(review): bcftools_preference is hardcoded
                                    # False, so the "bcftools" inference branch below
                                    # is currently dead — parquet is always chosen
                                    # for supported formats
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" is listed twice below
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the found
                                    # file under param["annotation"][tool]["annotations"]
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

            self.set_param(param)

        # Dispatch to the tool-specific annotation methods, in fixed order
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It annotates the VCF file with the annotations specified in the config file.
3089 def annotation_snpsift(self, threads: int = None) -> None: 3090 """ 3091 This function annotate with bcftools 3092 3093 :param threads: Number of threads to use 3094 :return: the value of the variable "return_value". 3095 """ 3096 3097 # DEBUG 3098 log.debug("Start annotation with bcftools databases") 3099 3100 # Threads 3101 if not threads: 3102 threads = self.get_threads() 3103 log.debug("Threads: " + str(threads)) 3104 3105 # Config 3106 config = self.get_config() 3107 log.debug("Config: " + str(config)) 3108 3109 # Config - snpSift 3110 snpsift_bin_command = get_bin_command( 3111 bin="SnpSift.jar", 3112 tool="snpsift", 3113 bin_type="jar", 3114 config=config, 3115 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3116 ) 3117 if not snpsift_bin_command: 3118 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3119 log.error(msg_err) 3120 raise ValueError(msg_err) 3121 3122 # Config - bcftools 3123 bcftools_bin_command = get_bin_command( 3124 bin="bcftools", 3125 tool="bcftools", 3126 bin_type="bin", 3127 config=config, 3128 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3129 ) 3130 if not bcftools_bin_command: 3131 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3132 log.error(msg_err) 3133 raise ValueError(msg_err) 3134 3135 # Config - BCFTools databases folders 3136 databases_folders = set( 3137 self.get_config() 3138 .get("folders", {}) 3139 .get("databases", {}) 3140 .get("annotations", ["."]) 3141 + self.get_config() 3142 .get("folders", {}) 3143 .get("databases", {}) 3144 .get("bcftools", ["."]) 3145 ) 3146 log.debug("Databases annotations: " + str(databases_folders)) 3147 3148 # Param 3149 annotations = ( 3150 self.get_param() 3151 .get("annotation", {}) 3152 .get("snpsift", {}) 3153 .get("annotations", None) 3154 ) 3155 log.debug("Annotations: " + str(annotations)) 3156 3157 # Assembly 3158 assembly = self.get_param().get( 3159 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3160 ) 3161 
3162 # Data 3163 table_variants = self.get_table_variants() 3164 3165 # Check if not empty 3166 log.debug("Check if not empty") 3167 sql_query_chromosomes = ( 3168 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3169 ) 3170 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3171 if not sql_query_chromosomes_df["count"][0]: 3172 log.info(f"VCF empty") 3173 return 3174 3175 # VCF header 3176 vcf_reader = self.get_header() 3177 log.debug("Initial header: " + str(vcf_reader.infos)) 3178 3179 # Existing annotations 3180 for vcf_annotation in self.get_header().infos: 3181 3182 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3183 log.debug( 3184 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3185 ) 3186 3187 if annotations: 3188 3189 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3190 3191 # Export VCF file 3192 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3193 3194 # Init 3195 commands = {} 3196 3197 for annotation in annotations: 3198 annotation_fields = annotations[annotation] 3199 3200 # Annotation Name 3201 annotation_name = os.path.basename(annotation) 3202 3203 if not annotation_fields: 3204 annotation_fields = {"INFO": None} 3205 3206 log.debug(f"Annotation '{annotation_name}'") 3207 log.debug( 3208 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3209 ) 3210 3211 # Create Database 3212 database = Database( 3213 database=annotation, 3214 databases_folders=databases_folders, 3215 assembly=assembly, 3216 ) 3217 3218 # Find files 3219 db_file = database.get_database() 3220 db_file = full_path(db_file) 3221 db_hdr_file = database.get_header_file() 3222 db_hdr_file = full_path(db_hdr_file) 3223 db_file_type = database.get_format() 3224 db_tbi_file = f"{db_file}.tbi" 3225 db_file_compressed = database.is_compressed() 3226 3227 # Check if compressed 3228 if not db_file_compressed: 3229 log.error( 3230 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3231 ) 3232 raise ValueError( 3233 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3234 ) 3235 3236 # Check if indexed 3237 if not os.path.exists(db_tbi_file): 3238 log.error( 3239 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3240 ) 3241 raise ValueError( 3242 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3243 ) 3244 3245 # Check index - try to create if not exists 3246 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3247 log.error("Annotation failed: database not valid") 3248 log.error(f"Annotation annotation file: {db_file}") 3249 log.error(f"Annotation annotation header: {db_hdr_file}") 3250 log.error(f"Annotation annotation index: {db_tbi_file}") 3251 raise ValueError( 3252 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3253 ) 3254 else: 3255 3256 log.debug( 3257 f"Annotation '{annotation}' - file: " 3258 + str(db_file) 3259 + " and " 3260 + str(db_hdr_file) 3261 ) 3262 3263 # Load header as VCF object 3264 db_hdr_vcf = Variants(input=db_hdr_file) 3265 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3266 log.debug( 3267 "Annotation database header: " 3268 + str(db_hdr_vcf_header_infos) 3269 ) 3270 3271 # For all fields in database 3272 annotation_fields_full = False 3273 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3274 annotation_fields = { 3275 key: key for key in db_hdr_vcf_header_infos 3276 } 3277 log.debug( 3278 "Annotation database header - All annotations added: " 3279 + str(annotation_fields) 3280 ) 3281 annotation_fields_full = True 3282 3283 # # Create file for field rename 3284 # log.debug("Create file for field rename") 3285 # tmp_rename = NamedTemporaryFile( 3286 # prefix=self.get_prefix(), 3287 # dir=self.get_tmp_dir(), 3288 # suffix=".rename", 3289 # delete=False, 3290 # ) 3291 # tmp_rename_name = tmp_rename.name 
3292 # tmp_files.append(tmp_rename_name) 3293 3294 # Number of fields 3295 nb_annotation_field = 0 3296 annotation_list = [] 3297 annotation_infos_rename_list = [] 3298 3299 for annotation_field in annotation_fields: 3300 3301 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3302 annotation_fields_new_name = annotation_fields.get( 3303 annotation_field, annotation_field 3304 ) 3305 if not annotation_fields_new_name: 3306 annotation_fields_new_name = annotation_field 3307 3308 # Check if field is in DB and if field is not elready in input data 3309 if ( 3310 annotation_field in db_hdr_vcf.get_header().infos 3311 and annotation_fields_new_name 3312 not in self.get_header().infos 3313 ): 3314 3315 log.info( 3316 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3317 ) 3318 3319 # BCFTools annotate param to rename fields 3320 if annotation_field != annotation_fields_new_name: 3321 annotation_infos_rename_list.append( 3322 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3323 ) 3324 3325 # Add INFO field to header 3326 db_hdr_vcf_header_infos_number = ( 3327 db_hdr_vcf_header_infos[annotation_field].num or "." 
3328 ) 3329 db_hdr_vcf_header_infos_type = ( 3330 db_hdr_vcf_header_infos[annotation_field].type 3331 or "String" 3332 ) 3333 db_hdr_vcf_header_infos_description = ( 3334 db_hdr_vcf_header_infos[annotation_field].desc 3335 or f"{annotation_field} description" 3336 ) 3337 db_hdr_vcf_header_infos_source = ( 3338 db_hdr_vcf_header_infos[annotation_field].source 3339 or "unknown" 3340 ) 3341 db_hdr_vcf_header_infos_version = ( 3342 db_hdr_vcf_header_infos[annotation_field].version 3343 or "unknown" 3344 ) 3345 3346 vcf_reader.infos[annotation_fields_new_name] = ( 3347 vcf.parser._Info( 3348 annotation_fields_new_name, 3349 db_hdr_vcf_header_infos_number, 3350 db_hdr_vcf_header_infos_type, 3351 db_hdr_vcf_header_infos_description, 3352 db_hdr_vcf_header_infos_source, 3353 db_hdr_vcf_header_infos_version, 3354 self.code_type_map[ 3355 db_hdr_vcf_header_infos_type 3356 ], 3357 ) 3358 ) 3359 3360 annotation_list.append(annotation_field) 3361 3362 nb_annotation_field += 1 3363 3364 else: 3365 3366 if ( 3367 annotation_field 3368 not in db_hdr_vcf.get_header().infos 3369 ): 3370 log.warning( 3371 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3372 ) 3373 if ( 3374 annotation_fields_new_name 3375 in self.get_header().infos 3376 ): 3377 log.warning( 3378 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3379 ) 3380 3381 log.info( 3382 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3383 ) 3384 3385 annotation_infos = ",".join(annotation_list) 3386 3387 if annotation_infos != "": 3388 3389 # Annotated VCF (and error file) 3390 tmp_annotation_vcf_name = os.path.join( 3391 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3392 ) 3393 tmp_annotation_vcf_name_err = ( 3394 tmp_annotation_vcf_name + ".err" 3395 ) 3396 3397 # Add fields to annotate 3398 if not annotation_fields_full: 3399 annotation_infos_option = f"-info {annotation_infos}" 3400 else: 
3401 annotation_infos_option = "" 3402 3403 # Info fields rename 3404 if annotation_infos_rename_list: 3405 annotation_infos_rename = " -c " + ",".join( 3406 annotation_infos_rename_list 3407 ) 3408 else: 3409 annotation_infos_rename = "" 3410 3411 # Annotate command 3412 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3413 3414 # Add command 3415 commands[command_annotate] = tmp_annotation_vcf_name 3416 3417 if commands: 3418 3419 # Export VCF file 3420 self.export_variant_vcf( 3421 vcf_file=tmp_vcf_name, 3422 remove_info=True, 3423 add_samples=False, 3424 index=True, 3425 ) 3426 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3427 3428 # Num command 3429 nb_command = 0 3430 3431 # Annotate 3432 for command_annotate in commands: 3433 nb_command += 1 3434 log.info( 3435 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3436 ) 3437 log.debug(f"command_annotate={command_annotate}") 3438 run_parallel_commands([command_annotate], threads) 3439 3440 # Debug 3441 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3442 3443 # Update variants 3444 log.info( 3445 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3446 ) 3447 self.update_from_vcf(commands[command_annotate])
This function annotate with bcftools
Parameters
- threads: Number of threads to use
Returns
the value of the variable "return_value".
3449 def annotation_bcftools(self, threads: int = None) -> None: 3450 """ 3451 This function annotate with bcftools 3452 3453 :param threads: Number of threads to use 3454 :return: the value of the variable "return_value". 3455 """ 3456 3457 # DEBUG 3458 log.debug("Start annotation with bcftools databases") 3459 3460 # Threads 3461 if not threads: 3462 threads = self.get_threads() 3463 log.debug("Threads: " + str(threads)) 3464 3465 # Config 3466 config = self.get_config() 3467 log.debug("Config: " + str(config)) 3468 3469 # DEBUG 3470 delete_tmp = True 3471 if self.get_config().get("verbosity", "warning") in ["debug"]: 3472 delete_tmp = False 3473 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 3474 3475 # Config - BCFTools bin command 3476 bcftools_bin_command = get_bin_command( 3477 bin="bcftools", 3478 tool="bcftools", 3479 bin_type="bin", 3480 config=config, 3481 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3482 ) 3483 if not bcftools_bin_command: 3484 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3485 log.error(msg_err) 3486 raise ValueError(msg_err) 3487 3488 # Config - BCFTools databases folders 3489 databases_folders = set( 3490 self.get_config() 3491 .get("folders", {}) 3492 .get("databases", {}) 3493 .get("annotations", ["."]) 3494 + self.get_config() 3495 .get("folders", {}) 3496 .get("databases", {}) 3497 .get("bcftools", ["."]) 3498 ) 3499 log.debug("Databases annotations: " + str(databases_folders)) 3500 3501 # Param 3502 annotations = ( 3503 self.get_param() 3504 .get("annotation", {}) 3505 .get("bcftools", {}) 3506 .get("annotations", None) 3507 ) 3508 log.debug("Annotations: " + str(annotations)) 3509 3510 # Assembly 3511 assembly = self.get_param().get( 3512 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3513 ) 3514 3515 # Data 3516 table_variants = self.get_table_variants() 3517 3518 # Check if not empty 3519 log.debug("Check if not empty") 3520 sql_query_chromosomes = ( 3521 f"""SELECT 
count(*) as count FROM {table_variants} as table_variants""" 3522 ) 3523 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3524 if not sql_query_chromosomes_df["count"][0]: 3525 log.info(f"VCF empty") 3526 return 3527 3528 # Export in VCF 3529 log.debug("Create initial file to annotate") 3530 tmp_vcf = NamedTemporaryFile( 3531 prefix=self.get_prefix(), 3532 dir=self.get_tmp_dir(), 3533 suffix=".vcf.gz", 3534 delete=False, 3535 ) 3536 tmp_vcf_name = tmp_vcf.name 3537 3538 # VCF header 3539 vcf_reader = self.get_header() 3540 log.debug("Initial header: " + str(vcf_reader.infos)) 3541 3542 # Existing annotations 3543 for vcf_annotation in self.get_header().infos: 3544 3545 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3546 log.debug( 3547 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3548 ) 3549 3550 if annotations: 3551 3552 tmp_ann_vcf_list = [] 3553 commands = [] 3554 tmp_files = [] 3555 err_files = [] 3556 3557 for annotation in annotations: 3558 annotation_fields = annotations[annotation] 3559 3560 # Annotation Name 3561 annotation_name = os.path.basename(annotation) 3562 3563 if not annotation_fields: 3564 annotation_fields = {"INFO": None} 3565 3566 log.debug(f"Annotation '{annotation_name}'") 3567 log.debug( 3568 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3569 ) 3570 3571 # Create Database 3572 database = Database( 3573 database=annotation, 3574 databases_folders=databases_folders, 3575 assembly=assembly, 3576 ) 3577 3578 # Find files 3579 db_file = database.get_database() 3580 db_file = full_path(db_file) 3581 db_hdr_file = database.get_header_file() 3582 db_hdr_file = full_path(db_hdr_file) 3583 db_file_type = database.get_format() 3584 db_tbi_file = f"{db_file}.tbi" 3585 db_file_compressed = database.is_compressed() 3586 3587 # Check if compressed 3588 if not db_file_compressed: 3589 log.error( 3590 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3591 ) 
3592 raise ValueError( 3593 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3594 ) 3595 3596 # Check if indexed 3597 if not os.path.exists(db_tbi_file): 3598 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 3599 raise ValueError( 3600 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3601 ) 3602 3603 # Check index - try to create if not exists 3604 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3605 log.error("Annotation failed: database not valid") 3606 log.error(f"Annotation annotation file: {db_file}") 3607 log.error(f"Annotation annotation header: {db_hdr_file}") 3608 log.error(f"Annotation annotation index: {db_tbi_file}") 3609 raise ValueError( 3610 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3611 ) 3612 else: 3613 3614 log.debug( 3615 f"Annotation '{annotation}' - file: " 3616 + str(db_file) 3617 + " and " 3618 + str(db_hdr_file) 3619 ) 3620 3621 # Load header as VCF object 3622 db_hdr_vcf = Variants(input=db_hdr_file) 3623 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3624 log.debug( 3625 "Annotation database header: " + str(db_hdr_vcf_header_infos) 3626 ) 3627 3628 # For all fields in database 3629 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3630 annotation_fields = { 3631 key: key for key in db_hdr_vcf_header_infos 3632 } 3633 log.debug( 3634 "Annotation database header - All annotations added: " 3635 + str(annotation_fields) 3636 ) 3637 3638 # Number of fields 3639 nb_annotation_field = 0 3640 annotation_list = [] 3641 3642 for annotation_field in annotation_fields: 3643 3644 # field new name, if parametered SKIPPED !!!!!! 
not managed actually TODO 3645 annotation_fields_new_name = annotation_fields.get( 3646 annotation_field, annotation_field 3647 ) 3648 if not annotation_fields_new_name: 3649 annotation_fields_new_name = annotation_field 3650 3651 # Check if field is in DB and if field is not elready in input data 3652 if ( 3653 annotation_field in db_hdr_vcf.get_header().infos 3654 and annotation_fields_new_name 3655 not in self.get_header().infos 3656 ): 3657 3658 log.info( 3659 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3660 ) 3661 3662 # Add INFO field to header 3663 db_hdr_vcf_header_infos_number = ( 3664 db_hdr_vcf_header_infos[annotation_field].num or "." 3665 ) 3666 db_hdr_vcf_header_infos_type = ( 3667 db_hdr_vcf_header_infos[annotation_field].type 3668 or "String" 3669 ) 3670 db_hdr_vcf_header_infos_description = ( 3671 db_hdr_vcf_header_infos[annotation_field].desc 3672 or f"{annotation_field} description" 3673 ) 3674 db_hdr_vcf_header_infos_source = ( 3675 db_hdr_vcf_header_infos[annotation_field].source 3676 or "unknown" 3677 ) 3678 db_hdr_vcf_header_infos_version = ( 3679 db_hdr_vcf_header_infos[annotation_field].version 3680 or "unknown" 3681 ) 3682 3683 vcf_reader.infos[annotation_fields_new_name] = ( 3684 vcf.parser._Info( 3685 annotation_fields_new_name, 3686 db_hdr_vcf_header_infos_number, 3687 db_hdr_vcf_header_infos_type, 3688 db_hdr_vcf_header_infos_description, 3689 db_hdr_vcf_header_infos_source, 3690 db_hdr_vcf_header_infos_version, 3691 self.code_type_map[db_hdr_vcf_header_infos_type], 3692 ) 3693 ) 3694 3695 # annotation_list.append(annotation_field) 3696 if annotation_field != annotation_fields_new_name: 3697 annotation_list.append( 3698 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3699 ) 3700 else: 3701 annotation_list.append(annotation_field) 3702 3703 nb_annotation_field += 1 3704 3705 else: 3706 3707 if annotation_field not in db_hdr_vcf.get_header().infos: 3708 log.warning( 3709 
f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 3710 ) 3711 if annotation_fields_new_name in self.get_header().infos: 3712 log.warning( 3713 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 3714 ) 3715 3716 log.info( 3717 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3718 ) 3719 3720 annotation_infos = ",".join(annotation_list) 3721 3722 if annotation_infos != "": 3723 3724 # Protect header for bcftools (remove "#CHROM" and variants line) 3725 log.debug("Protect Header file - remove #CHROM line if exists") 3726 tmp_header_vcf = NamedTemporaryFile( 3727 prefix=self.get_prefix(), 3728 dir=self.get_tmp_dir(), 3729 suffix=".hdr", 3730 delete=False, 3731 ) 3732 tmp_header_vcf_name = tmp_header_vcf.name 3733 tmp_files.append(tmp_header_vcf_name) 3734 # Command 3735 if db_hdr_file.endswith(".gz"): 3736 command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3737 else: 3738 command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 3739 # Run 3740 run_parallel_commands([command_extract_header], 1) 3741 3742 # Find chomosomes 3743 log.debug("Find chromosomes ") 3744 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 3745 sql_query_chromosomes_df = self.get_query_to_df( 3746 sql_query_chromosomes 3747 ) 3748 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 3749 3750 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 3751 3752 # BED columns in the annotation file 3753 if db_file_type in ["bed"]: 3754 annotation_infos = "CHROM,POS,POS," + annotation_infos 3755 3756 for chrom in chomosomes_list: 3757 3758 # Create BED on initial VCF 3759 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 3760 tmp_bed = NamedTemporaryFile( 3761 prefix=self.get_prefix(), 3762 
dir=self.get_tmp_dir(), 3763 suffix=".bed", 3764 delete=False, 3765 ) 3766 tmp_bed_name = tmp_bed.name 3767 tmp_files.append(tmp_bed_name) 3768 3769 # Detecte regions 3770 log.debug( 3771 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 3772 ) 3773 window = 1000000 3774 sql_query_intervals_for_bed = f""" 3775 SELECT \"#CHROM\", 3776 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 3777 \"POS\"+{window} 3778 FROM {table_variants} as table_variants 3779 WHERE table_variants.\"#CHROM\" = '{chrom}' 3780 """ 3781 regions = self.conn.execute( 3782 sql_query_intervals_for_bed 3783 ).fetchall() 3784 merged_regions = merge_regions(regions) 3785 log.debug( 3786 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 3787 ) 3788 3789 header = ["#CHROM", "START", "END"] 3790 with open(tmp_bed_name, "w") as f: 3791 # Write the header with tab delimiter 3792 f.write("\t".join(header) + "\n") 3793 for d in merged_regions: 3794 # Write each data row with tab delimiter 3795 f.write("\t".join(map(str, d)) + "\n") 3796 3797 # Tmp files 3798 tmp_annotation_vcf = NamedTemporaryFile( 3799 prefix=self.get_prefix(), 3800 dir=self.get_tmp_dir(), 3801 suffix=".vcf.gz", 3802 delete=False, 3803 ) 3804 tmp_annotation_vcf_name = tmp_annotation_vcf.name 3805 tmp_files.append(tmp_annotation_vcf_name) 3806 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 3807 tmp_annotation_vcf_name_err = ( 3808 tmp_annotation_vcf_name + ".err" 3809 ) 3810 err_files.append(tmp_annotation_vcf_name_err) 3811 3812 # Annotate Command 3813 log.debug( 3814 f"Annotation '{annotation}' - add bcftools command" 3815 ) 3816 3817 # Command 3818 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 
2>>{tmp_annotation_vcf_name_err} " 3819 3820 # Add command 3821 commands.append(command_annotate) 3822 3823 # if some commands 3824 if commands: 3825 3826 # Export VCF file 3827 self.export_variant_vcf( 3828 vcf_file=tmp_vcf_name, 3829 remove_info=True, 3830 add_samples=False, 3831 index=True, 3832 ) 3833 3834 # Threads 3835 # calculate threads for annotated commands 3836 if commands: 3837 threads_bcftools_annotate = round(threads / len(commands)) 3838 else: 3839 threads_bcftools_annotate = 1 3840 3841 if not threads_bcftools_annotate: 3842 threads_bcftools_annotate = 1 3843 3844 # Add threads option to bcftools commands 3845 if threads_bcftools_annotate > 1: 3846 commands_threaded = [] 3847 for command in commands: 3848 commands_threaded.append( 3849 command.replace( 3850 f"{bcftools_bin_command} annotate ", 3851 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 3852 ) 3853 ) 3854 commands = commands_threaded 3855 3856 # Command annotation multithreading 3857 log.debug(f"Annotation - Annotation commands: " + str(commands)) 3858 log.info( 3859 f"Annotation - Annotation multithreaded in " 3860 + str(len(commands)) 3861 + " commands" 3862 ) 3863 3864 run_parallel_commands(commands, threads) 3865 3866 # Merge 3867 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 3868 3869 if tmp_ann_vcf_list_cmd: 3870 3871 # Tmp file 3872 tmp_annotate_vcf = NamedTemporaryFile( 3873 prefix=self.get_prefix(), 3874 dir=self.get_tmp_dir(), 3875 suffix=".vcf.gz", 3876 delete=True, 3877 ) 3878 tmp_annotate_vcf_name = tmp_annotate_vcf.name 3879 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 3880 err_files.append(tmp_annotate_vcf_name_err) 3881 3882 # Tmp file remove command 3883 tmp_files_remove_command = "" 3884 if tmp_files: 3885 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 3886 3887 # Command merge 3888 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o 
{tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 3889 log.info( 3890 f"Annotation - Annotation merging " 3891 + str(len(commands)) 3892 + " annotated files" 3893 ) 3894 log.debug(f"Annotation - merge command: {merge_command}") 3895 run_parallel_commands([merge_command], 1) 3896 3897 # Error messages 3898 log.info(f"Error/Warning messages:") 3899 error_message_command_all = [] 3900 error_message_command_warning = [] 3901 error_message_command_err = [] 3902 for err_file in err_files: 3903 with open(err_file, "r") as f: 3904 for line in f: 3905 message = line.strip() 3906 error_message_command_all.append(message) 3907 if line.startswith("[W::"): 3908 error_message_command_warning.append(message) 3909 if line.startswith("[E::"): 3910 error_message_command_err.append( 3911 f"{err_file}: " + message 3912 ) 3913 # log info 3914 for message in list( 3915 set(error_message_command_err + error_message_command_warning) 3916 ): 3917 log.info(f" {message}") 3918 # debug info 3919 for message in list(set(error_message_command_all)): 3920 log.debug(f" {message}") 3921 # failed 3922 if len(error_message_command_err): 3923 log.error("Annotation failed: Error in commands") 3924 raise ValueError("Annotation failed: Error in commands") 3925 3926 # Update variants 3927 log.info(f"Annotation - Updating...") 3928 self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates with bcftools
Parameters
- threads: Number of threads to use
Returns
None (annotations are merged back into the variants table via update_from_vcf).
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        This function annotates variants with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default: None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict):
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list):
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use (defaults to self.get_threads())
        :return: True when annotation completed; False if the VCF is empty or
            contains no samples
        :raises ValueError: on missing Exomiser binary, invalid analysis/phenopacket
            configuration, missing samples, or Exomiser command failure
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads (fall back to instance-level default)
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # Exomiser databases folder (config "folders.databases.exomiser" or default)
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        # NOTE(review): a missing folder is only logged here, not raised —
        # databases_download_exomiser below is expected to create/populate it
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser (locate the exomiser-cli jar)
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser (section "annotation" -> "exomiser")
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly (param overrides config, then default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header (mutated in place below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples: Exomiser needs at least one sample/genotype
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit (default 8G)
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if not already present)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation
        # NOTE(review): hard-coded True, so the header check below always passes
        # and any existing "Exomiser" annotation is recomputed
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            # All intermediate files live in a temporary directory,
            # removed automatically when the block exits
            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis (JSON or YAML)
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict
                        # (yaml.safe_load parses both YAML and JSON)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Use the dict directly as analysis dict
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name "preset-<preset>-analysis.json" in config folder
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            # param_exomiser_analysis_dict[""] = json.load(json_file)
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load phenopacket (JSON or YAML)
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Use the dict directly
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject does not exist -> find a sample ID
                    if not param_exomiser_subject:

                        # Sample ID from param
                        sample = param_exomiser.get("sample", None)

                        # Fall back to the first sample of the VCF
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures does not exist -> try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # HPO list from param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if given as a comma-separated string
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list (keep only digits, e.g. "HP:0001156" -> "0001156")
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If no phenotypicFeatures at all -> remove hiPhivePrioritiser step
                    # (this prioritiser requires phenotype input)
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                # NOTE(review): the membership test is on the top-level dict but the
                # assignment goes into "phenopacket" — confirm this asymmetry is intended.
                # NOTE(review): datetime.now() is naive local time but the value is
                # suffixed with "Z" (UTC marker) — confirm.
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Force outputDirectory/outputFileName so results land in tmp_dir
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and ensure TSV_VARIANT/VCF formats are present
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files ###

                # Split analysis dict (shallow copy; only the top-level
                # "phenopacket" key is popped below)
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                ########################

                ### Create list of samples to use and include into initial VCF file ###

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample IDs within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample IDs in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concatenate subject/proband sample IDs and pedigree sample IDs (deduplicated)
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file (INFO stripped; Exomiser recomputes annotations)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command appears unused below — candidate for removal
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list (only if the file exists for this release/assembly)
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains proband param -> split analysis/sample files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually unique sample) -> single full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command (non-zero exit status aborts the annotation)
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Whether to explode the TSV result columns into INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types (LIMIT 0 keeps only the schema)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Coordinate/genotype columns that must not become INFO fields
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type: object dtype -> Float if fully numeric,
                            # otherwise String; any non-object dtype -> Integer
                            # NOTE(review): non-object float dtypes are labeled
                            # "Integer" by this branch — confirm this is intended
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: sanitize column name ("-" -> "_", drop "#")
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to the concat expression for the UPDATE below
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append TSV-derived fields to INFO, joining on
                    # chromosome ("chr" + CONTIG), position and alleles
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            (
                                SELECT
                                    concat(
                                        {",".join(sql_query_update_concat_fields)}
                                    )
                                FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                    AND table_parquet.\"START\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                            )
                        )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
This function annotates with Exomiser
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionnary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) Default : None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotipic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string):
Sample name to construct "subject" section:
"subject":
{
"id": "
", "sex": "UNKNOWN_SEX" } Default: None - "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomise database release. If not exists, database release will be downloaded (take a while). Default: None (provided by application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
4709 def annotation_snpeff(self, threads: int = None) -> None: 4710 """ 4711 This function annotate with snpEff 4712 4713 :param threads: The number of threads to use 4714 :return: the value of the variable "return_value". 4715 """ 4716 4717 # DEBUG 4718 log.debug("Start annotation with snpeff databases") 4719 4720 # Threads 4721 if not threads: 4722 threads = self.get_threads() 4723 log.debug("Threads: " + str(threads)) 4724 4725 # DEBUG 4726 delete_tmp = True 4727 if self.get_config().get("verbosity", "warning") in ["debug"]: 4728 delete_tmp = False 4729 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4730 4731 # Config 4732 config = self.get_config() 4733 log.debug("Config: " + str(config)) 4734 4735 # Config - Folders - Databases 4736 databases_folders = ( 4737 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 4738 ) 4739 log.debug("Databases annotations: " + str(databases_folders)) 4740 4741 # # Config - Java 4742 # java_bin = get_bin( 4743 # tool="java", 4744 # bin="java", 4745 # bin_type="bin", 4746 # config=config, 4747 # default_folder="/usr/bin", 4748 # ) 4749 # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))): 4750 # log.error(f"Annotation failed: no java bin '{java_bin}'") 4751 # raise ValueError(f"Annotation failed: no java bin '{java_bin}'") 4752 4753 # # Config - snpEff bin 4754 # snpeff_jar = get_bin( 4755 # tool="snpeff", 4756 # bin="snpEff.jar", 4757 # bin_type="jar", 4758 # config=config, 4759 # default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4760 # ) 4761 # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))): 4762 # log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4763 # raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'") 4764 4765 # Config - snpEff bin command 4766 snpeff_bin_command = get_bin_command( 4767 bin="snpEff.jar", 4768 tool="snpeff", 4769 bin_type="jar", 4770 config=config, 4771 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 4772 ) 
4773 if not snpeff_bin_command: 4774 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 4775 log.error(msg_err) 4776 raise ValueError(msg_err) 4777 4778 # Config - snpEff databases 4779 snpeff_databases = ( 4780 config.get("folders", {}) 4781 .get("databases", {}) 4782 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 4783 ) 4784 snpeff_databases = full_path(snpeff_databases) 4785 if snpeff_databases is not None and snpeff_databases != "": 4786 log.debug(f"Create snpEff databases folder") 4787 if not os.path.exists(snpeff_databases): 4788 os.makedirs(snpeff_databases) 4789 4790 # Param 4791 param = self.get_param() 4792 log.debug("Param: " + str(param)) 4793 4794 # Param 4795 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 4796 log.debug("Options: " + str(options)) 4797 4798 # Param - Assembly 4799 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4800 4801 # Param - Options 4802 snpeff_options = ( 4803 param.get("annotation", {}).get("snpeff", {}).get("options", "") 4804 ) 4805 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 4806 snpeff_csvstats = ( 4807 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 4808 ) 4809 if snpeff_stats: 4810 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 4811 snpeff_stats = full_path(snpeff_stats) 4812 snpeff_options += f" -stats {snpeff_stats}" 4813 if snpeff_csvstats: 4814 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 4815 snpeff_csvstats = full_path(snpeff_csvstats) 4816 snpeff_options += f" -csvStats {snpeff_csvstats}" 4817 4818 # Data 4819 table_variants = self.get_table_variants() 4820 4821 # Check if not empty 4822 log.debug("Check if not empty") 4823 sql_query_chromosomes = ( 4824 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4825 ) 4826 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 4827 if not 
self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4828 log.info(f"VCF empty") 4829 return 4830 4831 # Export in VCF 4832 log.debug("Create initial file to annotate") 4833 tmp_vcf = NamedTemporaryFile( 4834 prefix=self.get_prefix(), 4835 dir=self.get_tmp_dir(), 4836 suffix=".vcf.gz", 4837 delete=True, 4838 ) 4839 tmp_vcf_name = tmp_vcf.name 4840 4841 # VCF header 4842 vcf_reader = self.get_header() 4843 log.debug("Initial header: " + str(vcf_reader.infos)) 4844 4845 # Existing annotations 4846 for vcf_annotation in self.get_header().infos: 4847 4848 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4849 log.debug( 4850 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4851 ) 4852 4853 # Memory limit 4854 # if config.get("memory", None): 4855 # memory_limit = config.get("memory", "8G") 4856 # else: 4857 # memory_limit = "8G" 4858 memory_limit = self.get_memory("8G") 4859 log.debug(f"memory_limit: {memory_limit}") 4860 4861 # snpEff java options 4862 snpeff_java_options = ( 4863 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4864 ) 4865 log.debug(f"Exomiser java options: {snpeff_java_options}") 4866 4867 force_update_annotation = True 4868 4869 if "ANN" not in self.get_header().infos or force_update_annotation: 4870 4871 # Check snpEff database 4872 log.debug(f"Check snpEff databases {[assembly]}") 4873 databases_download_snpeff( 4874 folder=snpeff_databases, assemblies=[assembly], config=config 4875 ) 4876 4877 # Export VCF file 4878 self.export_variant_vcf( 4879 vcf_file=tmp_vcf_name, 4880 remove_info=True, 4881 add_samples=False, 4882 index=True, 4883 ) 4884 4885 # Tmp file 4886 err_files = [] 4887 tmp_annotate_vcf = NamedTemporaryFile( 4888 prefix=self.get_prefix(), 4889 dir=self.get_tmp_dir(), 4890 suffix=".vcf", 4891 delete=False, 4892 ) 4893 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4894 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4895 
err_files.append(tmp_annotate_vcf_name_err) 4896 4897 # Command 4898 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 4899 log.debug(f"Annotation - snpEff command: {snpeff_command}") 4900 run_parallel_commands([snpeff_command], 1) 4901 4902 # Error messages 4903 log.info(f"Error/Warning messages:") 4904 error_message_command_all = [] 4905 error_message_command_warning = [] 4906 error_message_command_err = [] 4907 for err_file in err_files: 4908 with open(err_file, "r") as f: 4909 for line in f: 4910 message = line.strip() 4911 error_message_command_all.append(message) 4912 if line.startswith("[W::"): 4913 error_message_command_warning.append(message) 4914 if line.startswith("[E::"): 4915 error_message_command_err.append(f"{err_file}: " + message) 4916 # log info 4917 for message in list( 4918 set(error_message_command_err + error_message_command_warning) 4919 ): 4920 log.info(f" {message}") 4921 # debug info 4922 for message in list(set(error_message_command_all)): 4923 log.debug(f" {message}") 4924 # failed 4925 if len(error_message_command_err): 4926 log.error("Annotation failed: Error in commands") 4927 raise ValueError("Annotation failed: Error in commands") 4928 4929 # Find annotation in header 4930 with open(tmp_annotate_vcf_name, "rt") as f: 4931 header_list = self.read_vcf_header(f) 4932 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 4933 4934 for ann in annovar_vcf_header.infos: 4935 if ann not in self.get_header().infos: 4936 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 4937 4938 # Update variants 4939 log.info(f"Annotation - Updating...") 4940 self.update_from_vcf(tmp_annotate_vcf_name) 4941 4942 else: 4943 if "ANN" in self.get_header().infos: 4944 log.debug(f"Existing snpEff annotations in VCF") 4945 if force_update_annotation: 4946 log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates with snpEff
Parameters
- threads: The number of threads to use
Returns
None.
    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the
        database with the new annotations.

        The variants table is exported to a temporary bgzipped VCF; each
        configured Annovar database is run through table_annovar.pl; the raw
        output is cleaned through a bcftools/sed/awk pipe; all per-database
        results are merged with ``bcftools merge``; and the merged INFO fields
        are written back into the variants table.

        :param threads: number of threads to use (defaults to
            ``self.get_threads()``)
        :return: None
        :raises ValueError: if the annovar or bcftools bin command cannot be
            resolved, or if an annotation command reported errors
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files (collected for final cleanup)
        tmp_files = []
        err_files = []

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is initialized but never used below
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run (and clean/rename pipe) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" mapping line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" for gene-based, "r" for region-based,
                # "f" (filter-based) otherwise
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                # awk pass rebuilds the INFO column (field $8), dropping any
                # key whose value is "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages collected from the stderr file(s)
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge - combine original VCF with all per-database results
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Merge new INFO fields into the current header
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): 'if True' presumably kept to easily disable cleanup
        # while debugging — confirm
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations
Parameters
- threads: number of threads to use
Returns
None.
5330 def annotation_parquet(self, threads: int = None) -> None: 5331 """ 5332 It takes a VCF file, and annotates it with a parquet file 5333 5334 :param threads: number of threads to use for the annotation 5335 :return: the value of the variable "result". 5336 """ 5337 5338 # DEBUG 5339 log.debug("Start annotation with parquet databases") 5340 5341 # Threads 5342 if not threads: 5343 threads = self.get_threads() 5344 log.debug("Threads: " + str(threads)) 5345 5346 # DEBUG 5347 delete_tmp = True 5348 if self.get_config().get("verbosity", "warning") in ["debug"]: 5349 delete_tmp = False 5350 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5351 5352 # Config 5353 databases_folders = set( 5354 self.get_config() 5355 .get("folders", {}) 5356 .get("databases", {}) 5357 .get("annotations", ["."]) 5358 + self.get_config() 5359 .get("folders", {}) 5360 .get("databases", {}) 5361 .get("parquet", ["."]) 5362 ) 5363 log.debug("Databases annotations: " + str(databases_folders)) 5364 5365 # Param 5366 annotations = ( 5367 self.get_param() 5368 .get("annotation", {}) 5369 .get("parquet", {}) 5370 .get("annotations", None) 5371 ) 5372 log.debug("Annotations: " + str(annotations)) 5373 5374 # Assembly 5375 assembly = self.get_param().get( 5376 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5377 ) 5378 5379 # Force Update Annotation 5380 force_update_annotation = ( 5381 self.get_param() 5382 .get("annotation", {}) 5383 .get("options", {}) 5384 .get("annotations_update", False) 5385 ) 5386 log.debug(f"force_update_annotation={force_update_annotation}") 5387 force_append_annotation = ( 5388 self.get_param() 5389 .get("annotation", {}) 5390 .get("options", {}) 5391 .get("annotations_append", False) 5392 ) 5393 log.debug(f"force_append_annotation={force_append_annotation}") 5394 5395 # Data 5396 table_variants = self.get_table_variants() 5397 5398 # Check if not empty 5399 log.debug("Check if not empty") 5400 sql_query_chromosomes_df = self.get_query_to_df( 
5401 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5402 ) 5403 if not sql_query_chromosomes_df["count"][0]: 5404 log.info(f"VCF empty") 5405 return 5406 5407 # VCF header 5408 vcf_reader = self.get_header() 5409 log.debug("Initial header: " + str(vcf_reader.infos)) 5410 5411 # Nb Variants POS 5412 log.debug("NB Variants Start") 5413 nb_variants = self.conn.execute( 5414 f"SELECT count(*) AS count FROM variants" 5415 ).fetchdf()["count"][0] 5416 log.debug("NB Variants Stop") 5417 5418 # Existing annotations 5419 for vcf_annotation in self.get_header().infos: 5420 5421 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5422 log.debug( 5423 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5424 ) 5425 5426 # Added columns 5427 added_columns = [] 5428 5429 # drop indexes 5430 log.debug(f"Drop indexes...") 5431 self.drop_indexes() 5432 5433 if annotations: 5434 5435 if "ALL" in annotations: 5436 5437 all_param = annotations.get("ALL", {}) 5438 all_param_formats = all_param.get("formats", None) 5439 all_param_releases = all_param.get("releases", None) 5440 5441 databases_infos_dict = self.scan_databases( 5442 database_formats=all_param_formats, 5443 database_releases=all_param_releases, 5444 ) 5445 for database_infos in databases_infos_dict.keys(): 5446 if database_infos not in annotations: 5447 annotations[database_infos] = {"INFO": None} 5448 5449 for annotation in annotations: 5450 5451 if annotation in ["ALL"]: 5452 continue 5453 5454 # Annotation Name 5455 annotation_name = os.path.basename(annotation) 5456 5457 # Annotation fields 5458 annotation_fields = annotations[annotation] 5459 if not annotation_fields: 5460 annotation_fields = {"INFO": None} 5461 5462 log.debug(f"Annotation '{annotation_name}'") 5463 log.debug( 5464 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5465 ) 5466 5467 # Create Database 5468 database = Database( 5469 database=annotation, 5470 
databases_folders=databases_folders, 5471 assembly=assembly, 5472 ) 5473 5474 # Find files 5475 parquet_file = database.get_database() 5476 parquet_hdr_file = database.get_header_file() 5477 parquet_type = database.get_type() 5478 5479 # Check if files exists 5480 if not parquet_file or not parquet_hdr_file: 5481 log.error("Annotation failed: file not found") 5482 raise ValueError("Annotation failed: file not found") 5483 else: 5484 # Get parquet connexion 5485 parquet_sql_attach = database.get_sql_database_attach( 5486 output="query" 5487 ) 5488 if parquet_sql_attach: 5489 self.conn.execute(parquet_sql_attach) 5490 parquet_file_link = database.get_sql_database_link() 5491 # Log 5492 log.debug( 5493 f"Annotation '{annotation_name}' - file: " 5494 + str(parquet_file) 5495 + " and " 5496 + str(parquet_hdr_file) 5497 ) 5498 5499 # Database full header columns 5500 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5501 parquet_hdr_file 5502 ) 5503 # Log 5504 log.debug( 5505 "Annotation database header columns : " 5506 + str(parquet_hdr_vcf_header_columns) 5507 ) 5508 5509 # Load header as VCF object 5510 parquet_hdr_vcf_header_infos = database.get_header().infos 5511 # Log 5512 log.debug( 5513 "Annotation database header: " 5514 + str(parquet_hdr_vcf_header_infos) 5515 ) 5516 5517 # Get extra infos 5518 parquet_columns = database.get_extra_columns() 5519 # Log 5520 log.debug("Annotation database Columns: " + str(parquet_columns)) 5521 5522 # Add extra columns if "ALL" in annotation_fields 5523 # if "ALL" in annotation_fields: 5524 # allow_add_extra_column = True 5525 if "ALL" in annotation_fields and database.get_extra_columns(): 5526 for extra_column in database.get_extra_columns(): 5527 if ( 5528 extra_column not in annotation_fields 5529 and extra_column.replace("INFO/", "") 5530 not in parquet_hdr_vcf_header_infos 5531 ): 5532 parquet_hdr_vcf_header_infos[extra_column] = ( 5533 vcf.parser._Info( 5534 extra_column, 5535 ".", 5536 "String", 5537 
f"{extra_column} description", 5538 "unknown", 5539 "unknown", 5540 self.code_type_map["String"], 5541 ) 5542 ) 5543 5544 # For all fields in database 5545 annotation_fields_all = False 5546 if "ALL" in annotation_fields or "INFO" in annotation_fields: 5547 annotation_fields_all = True 5548 annotation_fields = { 5549 key: key for key in parquet_hdr_vcf_header_infos 5550 } 5551 5552 log.debug( 5553 "Annotation database header - All annotations added: " 5554 + str(annotation_fields) 5555 ) 5556 5557 # Init 5558 5559 # List of annotation fields to use 5560 sql_query_annotation_update_info_sets = [] 5561 5562 # List of annotation to agregate 5563 sql_query_annotation_to_agregate = [] 5564 5565 # Number of fields 5566 nb_annotation_field = 0 5567 5568 # Annotation fields processed 5569 annotation_fields_processed = [] 5570 5571 # Columns mapping 5572 map_columns = database.map_columns( 5573 columns=annotation_fields, prefixes=["INFO/"] 5574 ) 5575 5576 # Query dict for fields to remove (update option) 5577 query_dict_remove = {} 5578 5579 # Fetch Anotation fields 5580 for annotation_field in annotation_fields: 5581 5582 # annotation_field_column 5583 annotation_field_column = map_columns.get( 5584 annotation_field, "INFO" 5585 ) 5586 5587 # field new name, if parametered 5588 annotation_fields_new_name = annotation_fields.get( 5589 annotation_field, annotation_field 5590 ) 5591 if not annotation_fields_new_name: 5592 annotation_fields_new_name = annotation_field 5593 5594 # To annotate 5595 # force_update_annotation = True 5596 # force_append_annotation = True 5597 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 5598 if annotation_field in parquet_hdr_vcf_header_infos and ( 5599 force_update_annotation 5600 or force_append_annotation 5601 or ( 5602 annotation_fields_new_name 5603 not in self.get_header().infos 5604 ) 5605 ): 5606 5607 # Add field to annotation to process 
list 5608 annotation_fields_processed.append( 5609 annotation_fields_new_name 5610 ) 5611 5612 # explode infos for the field 5613 annotation_fields_new_name_info_msg = "" 5614 if ( 5615 force_update_annotation 5616 and annotation_fields_new_name 5617 in self.get_header().infos 5618 ): 5619 # Remove field from INFO 5620 query = f""" 5621 UPDATE {table_variants} as table_variants 5622 SET INFO = REGEXP_REPLACE( 5623 concat(table_variants.INFO,''), 5624 ';*{annotation_fields_new_name}=[^;]*', 5625 '' 5626 ) 5627 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 5628 """ 5629 annotation_fields_new_name_info_msg = " [update]" 5630 query_dict_remove[ 5631 f"remove 'INFO/{annotation_fields_new_name}'" 5632 ] = query 5633 5634 # Sep between fields in INFO 5635 nb_annotation_field += 1 5636 if nb_annotation_field > 1: 5637 annotation_field_sep = ";" 5638 else: 5639 annotation_field_sep = "" 5640 5641 log.info( 5642 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 5643 ) 5644 5645 # Add INFO field to header 5646 parquet_hdr_vcf_header_infos_number = ( 5647 parquet_hdr_vcf_header_infos[annotation_field].num 5648 or "." 
5649 ) 5650 parquet_hdr_vcf_header_infos_type = ( 5651 parquet_hdr_vcf_header_infos[annotation_field].type 5652 or "String" 5653 ) 5654 parquet_hdr_vcf_header_infos_description = ( 5655 parquet_hdr_vcf_header_infos[annotation_field].desc 5656 or f"{annotation_field} description" 5657 ) 5658 parquet_hdr_vcf_header_infos_source = ( 5659 parquet_hdr_vcf_header_infos[annotation_field].source 5660 or "unknown" 5661 ) 5662 parquet_hdr_vcf_header_infos_version = ( 5663 parquet_hdr_vcf_header_infos[annotation_field].version 5664 or "unknown" 5665 ) 5666 5667 vcf_reader.infos[annotation_fields_new_name] = ( 5668 vcf.parser._Info( 5669 annotation_fields_new_name, 5670 parquet_hdr_vcf_header_infos_number, 5671 parquet_hdr_vcf_header_infos_type, 5672 parquet_hdr_vcf_header_infos_description, 5673 parquet_hdr_vcf_header_infos_source, 5674 parquet_hdr_vcf_header_infos_version, 5675 self.code_type_map[ 5676 parquet_hdr_vcf_header_infos_type 5677 ], 5678 ) 5679 ) 5680 5681 # Append 5682 if force_append_annotation: 5683 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 5684 else: 5685 query_case_when_append = "" 5686 5687 # Annotation/Update query fields 5688 # Found in INFO column 5689 if ( 5690 annotation_field_column == "INFO" 5691 and "INFO" in parquet_hdr_vcf_header_columns 5692 ): 5693 sql_query_annotation_update_info_sets.append( 5694 f""" 5695 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 5696 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 5697 ELSE '' 5698 END 5699 """ 5700 ) 5701 # Found in a specific column 5702 else: 5703 sql_query_annotation_update_info_sets.append( 5704 f""" 5705 CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append} 5706 THEN 
concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ',')) 5707 ELSE '' 5708 END 5709 """ 5710 ) 5711 sql_query_annotation_to_agregate.append( 5712 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 5713 ) 5714 5715 # Not to annotate 5716 else: 5717 5718 if force_update_annotation: 5719 annotation_message = "forced" 5720 else: 5721 annotation_message = "skipped" 5722 5723 if annotation_field not in parquet_hdr_vcf_header_infos: 5724 log.warning( 5725 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 5726 ) 5727 if annotation_fields_new_name in self.get_header().infos: 5728 log.warning( 5729 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 5730 ) 5731 5732 # Check if ALL fields have to be annotated. Thus concat all INFO field 5733 # allow_annotation_full_info = True 5734 allow_annotation_full_info = not force_append_annotation 5735 5736 if parquet_type in ["regions"]: 5737 allow_annotation_full_info = False 5738 5739 if ( 5740 allow_annotation_full_info 5741 and nb_annotation_field == len(annotation_fields) 5742 and annotation_fields_all 5743 and ( 5744 "INFO" in parquet_hdr_vcf_header_columns 5745 and "INFO" in database.get_extra_columns() 5746 ) 5747 ): 5748 log.debug("Column INFO annotation enabled") 5749 sql_query_annotation_update_info_sets = [] 5750 sql_query_annotation_update_info_sets.append( 5751 f" table_parquet.INFO " 5752 ) 5753 5754 if sql_query_annotation_update_info_sets: 5755 5756 # Annotate 5757 log.info(f"Annotation '{annotation_name}' - Annotation...") 5758 5759 # Join query annotation update info sets for SQL 5760 sql_query_annotation_update_info_sets_sql = ",".join( 5761 sql_query_annotation_update_info_sets 5762 ) 5763 5764 # Check chromosomes list (and variants 
infos) 5765 sql_query_chromosomes = f""" 5766 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 5767 FROM {table_variants} as table_variants 5768 GROUP BY table_variants."#CHROM" 5769 ORDER BY table_variants."#CHROM" 5770 """ 5771 sql_query_chromosomes_df = self.conn.execute( 5772 sql_query_chromosomes 5773 ).df() 5774 sql_query_chromosomes_dict = { 5775 entry["CHROM"]: { 5776 "count": entry["count_variants"], 5777 "min": entry["min_variants"], 5778 "max": entry["max_variants"], 5779 } 5780 for index, entry in sql_query_chromosomes_df.iterrows() 5781 } 5782 5783 # Init 5784 nb_of_query = 0 5785 nb_of_variant_annotated = 0 5786 query_dict = query_dict_remove 5787 5788 # for chrom in sql_query_chromosomes_df["CHROM"]: 5789 for chrom in sql_query_chromosomes_dict: 5790 5791 # Number of variant by chromosome 5792 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 5793 chrom, {} 5794 ).get("count", 0) 5795 5796 log.debug( 5797 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 
5798 ) 5799 5800 # Annotation with regions database 5801 if parquet_type in ["regions"]: 5802 sql_query_annotation_from_clause = f""" 5803 FROM ( 5804 SELECT 5805 '{chrom}' AS \"#CHROM\", 5806 table_variants_from.\"POS\" AS \"POS\", 5807 {",".join(sql_query_annotation_to_agregate)} 5808 FROM {table_variants} as table_variants_from 5809 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 5810 table_parquet_from."#CHROM" = '{chrom}' 5811 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 5812 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 5813 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 5814 ) 5815 ) 5816 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 5817 GROUP BY table_variants_from.\"POS\" 5818 ) 5819 as table_parquet 5820 """ 5821 5822 sql_query_annotation_where_clause = """ 5823 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5824 AND table_parquet.\"POS\" = table_variants.\"POS\" 5825 """ 5826 5827 # Annotation with variants database 5828 else: 5829 sql_query_annotation_from_clause = f""" 5830 FROM {parquet_file_link} as table_parquet 5831 """ 5832 sql_query_annotation_where_clause = f""" 5833 table_variants."#CHROM" = '{chrom}' 5834 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 5835 AND table_parquet.\"POS\" = table_variants.\"POS\" 5836 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5837 AND table_parquet.\"REF\" = table_variants.\"REF\" 5838 """ 5839 5840 # Create update query 5841 sql_query_annotation_chrom_interval_pos = f""" 5842 UPDATE {table_variants} as table_variants 5843 SET INFO = 5844 concat( 5845 CASE WHEN table_variants.INFO NOT IN ('','.') 5846 THEN table_variants.INFO 5847 ELSE '' 5848 END 5849 , 5850 CASE WHEN table_variants.INFO NOT IN ('','.') 5851 AND ( 5852 concat({sql_query_annotation_update_info_sets_sql}) 5853 ) 5854 NOT IN ('','.') 5855 THEN ';' 5856 ELSE '' 5857 END 5858 , 5859 
{sql_query_annotation_update_info_sets_sql} 5860 ) 5861 {sql_query_annotation_from_clause} 5862 WHERE {sql_query_annotation_where_clause} 5863 ; 5864 """ 5865 5866 # Add update query to dict 5867 query_dict[ 5868 f"{chrom} [{nb_of_variant_by_chrom} variants]" 5869 ] = sql_query_annotation_chrom_interval_pos 5870 5871 nb_of_query = len(query_dict) 5872 num_query = 0 5873 5874 # SET max_expression_depth TO x 5875 self.conn.execute("SET max_expression_depth TO 10000") 5876 5877 for query_name in query_dict: 5878 query = query_dict[query_name] 5879 num_query += 1 5880 log.info( 5881 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 5882 ) 5883 result = self.conn.execute(query) 5884 nb_of_variant_annotated_by_query = result.df()["Count"][0] 5885 nb_of_variant_annotated += nb_of_variant_annotated_by_query 5886 log.info( 5887 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 5888 ) 5889 5890 log.info( 5891 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 5892 ) 5893 5894 else: 5895 5896 log.info( 5897 f"Annotation '{annotation_name}' - No Annotations available" 5898 ) 5899 5900 log.debug("Final header: " + str(vcf_reader.infos)) 5901 5902 # Remove added columns 5903 for added_column in added_columns: 5904 self.drop_column(column=added_column)
It takes a VCF file and annotates it with a parquet file.
Parameters
- threads: number of threads to use for the annotation
Returns
Nothing; annotations are written back into the variants table by the update queries.
5906 def annotation_splice(self, threads: int = None) -> None: 5907 """ 5908 This function annotate with snpEff 5909 5910 :param threads: The number of threads to use 5911 :return: the value of the variable "return_value". 5912 """ 5913 5914 # DEBUG 5915 log.debug("Start annotation with splice tools") 5916 5917 # Threads 5918 if not threads: 5919 threads = self.get_threads() 5920 log.debug("Threads: " + str(threads)) 5921 5922 # DEBUG 5923 delete_tmp = True 5924 if self.get_config().get("verbosity", "warning") in ["debug"]: 5925 delete_tmp = False 5926 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5927 5928 # Config 5929 config = self.get_config() 5930 log.debug("Config: " + str(config)) 5931 splice_config = config.get("tools", {}).get("splice", {}) 5932 if not splice_config: 5933 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 5934 if not splice_config: 5935 msg_err = "No Splice tool config" 5936 log.error(msg_err) 5937 raise ValueError(msg_err) 5938 log.debug(f"splice_config={splice_config}") 5939 5940 # Config - Folders - Databases 5941 databases_folders = ( 5942 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 5943 ) 5944 log.debug("Databases annotations: " + str(databases_folders)) 5945 5946 # Splice docker image 5947 splice_docker_image = splice_config.get("docker").get("image") 5948 5949 # Pull splice image if it's not already there 5950 if not check_docker_image_exists(splice_docker_image): 5951 log.warning( 5952 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 5953 ) 5954 try: 5955 command(f"docker pull {splice_config.get('docker').get('image')}") 5956 except subprocess.CalledProcessError: 5957 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 5958 log.error(msg_err) 5959 raise ValueError(msg_err) 5960 return None 5961 5962 # Config - splice databases 5963 splice_databases = ( 5964 config.get("folders", {}) 5965 .get("databases", {}) 5966 
.get("splice", DEFAULT_SPLICE_FOLDER) 5967 ) 5968 splice_databases = full_path(splice_databases) 5969 5970 # Param 5971 param = self.get_param() 5972 log.debug("Param: " + str(param)) 5973 5974 # Param 5975 options = param.get("annotation", {}).get("splice", {}) 5976 log.debug("Options: " + str(options)) 5977 5978 # Data 5979 table_variants = self.get_table_variants() 5980 5981 # Check if not empty 5982 log.debug("Check if not empty") 5983 sql_query_chromosomes = ( 5984 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5985 ) 5986 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5987 log.info("VCF empty") 5988 return None 5989 5990 # Export in VCF 5991 log.debug("Create initial file to annotate") 5992 5993 # Create output folder 5994 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 5995 if not os.path.exists(output_folder): 5996 Path(output_folder).mkdir(parents=True, exist_ok=True) 5997 5998 # Create tmp VCF file 5999 tmp_vcf = NamedTemporaryFile( 6000 prefix=self.get_prefix(), 6001 dir=output_folder, 6002 suffix=".vcf", 6003 delete=False, 6004 ) 6005 tmp_vcf_name = tmp_vcf.name 6006 6007 # VCF header 6008 header = self.get_header() 6009 6010 # Existing annotations 6011 for vcf_annotation in self.get_header().infos: 6012 6013 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6014 log.debug( 6015 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6016 ) 6017 6018 # Memory limit 6019 if config.get("memory", None): 6020 memory_limit = config.get("memory", "8G").upper() 6021 # upper() 6022 else: 6023 memory_limit = "8G" 6024 log.debug(f"memory_limit: {memory_limit}") 6025 6026 # Check number of variants to annotate 6027 where_clause_regex_spliceai = r"SpliceAI_\w+" 6028 where_clause_regex_spip = r"SPiP_\w+" 6029 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6030 
df_list_of_variants_to_annotate = self.get_query_to_df( 6031 query=f""" SELECT * FROM variants {where_clause} """ 6032 ) 6033 if len(df_list_of_variants_to_annotate) == 0: 6034 log.warning( 6035 f"No variants to annotate with splice. Variants probably already annotated with splice" 6036 ) 6037 return None 6038 else: 6039 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6040 6041 # Export VCF file 6042 self.export_variant_vcf( 6043 vcf_file=tmp_vcf_name, 6044 remove_info=True, 6045 add_samples=True, 6046 index=False, 6047 where_clause=where_clause, 6048 ) 6049 6050 # Create docker container and launch splice analysis 6051 if splice_config: 6052 6053 # Splice mount folders 6054 mount_folders = splice_config.get("mount", {}) 6055 6056 # Genome mount 6057 mount_folders[ 6058 config.get("folders", {}) 6059 .get("databases", {}) 6060 .get("genomes", DEFAULT_GENOME_FOLDER) 6061 ] = "ro" 6062 6063 # SpliceAI mount 6064 mount_folders[ 6065 config.get("folders", {}) 6066 .get("databases", {}) 6067 .get("spliceai", DEFAULT_SPLICEAI_FOLDER) 6068 ] = "ro" 6069 6070 # Genome mount 6071 mount_folders[ 6072 config.get("folders", {}) 6073 .get("databases", {}) 6074 .get("spip", DEFAULT_SPIP_FOLDER) 6075 ] = "ro" 6076 6077 # Mount folders 6078 mount = [] 6079 6080 # Config mount 6081 mount = [ 6082 f"-v {full_path(path)}:{full_path(path)}:{mode}" 6083 for path, mode in mount_folders.items() 6084 ] 6085 6086 if any(value for value in splice_config.values() if value is None): 6087 log.warning("At least one splice config parameter is empty") 6088 return None 6089 6090 # Params in splice nf 6091 def check_values(dico: dict): 6092 """ 6093 Ensure parameters for NF splice pipeline 6094 """ 6095 for key, val in dico.items(): 6096 if key == "genome": 6097 if any( 6098 assemb in options.get("genome", {}) 6099 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6100 ): 6101 yield f"--{key} hg19" 6102 elif any( 6103 assemb in options.get("genome", {}) 6104 for assemb 
in ["hg38", "GRCh38", "grch38", "GRCH38"] 6105 ): 6106 yield f"--{key} hg38" 6107 elif ( 6108 (isinstance(val, str) and val) 6109 or isinstance(val, int) 6110 or isinstance(val, bool) 6111 ): 6112 yield f"--{key} {val}" 6113 6114 # Genome 6115 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6116 options["genome"] = genome 6117 6118 # NF params 6119 nf_params = [] 6120 6121 # Add options 6122 if options: 6123 nf_params = list(check_values(options)) 6124 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6125 else: 6126 log.debug("No NF params provided") 6127 6128 # Add threads 6129 if "threads" not in options.keys(): 6130 nf_params.append(f"--threads {threads}") 6131 6132 # Genome path 6133 genome_path = find_genome( 6134 config.get("folders", {}) 6135 .get("databases", {}) 6136 .get("genomes", DEFAULT_GENOME_FOLDER), 6137 file=f"{genome}.fa", 6138 ) 6139 # Add genome path 6140 if not genome_path: 6141 raise ValueError( 6142 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6143 ) 6144 else: 6145 log.debug(f"Genome: {genome_path}") 6146 nf_params.append(f"--genome_path {genome_path}") 6147 6148 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6149 """ 6150 Setting up updated databases for SPiP and SpliceAI 6151 """ 6152 6153 try: 6154 6155 # SpliceAI assembly transcriptome 6156 spliceai_assembly = os.path.join( 6157 config.get("folders", {}) 6158 .get("databases", {}) 6159 .get("spliceai", {}), 6160 options.get("genome"), 6161 "transcriptome", 6162 ) 6163 spip_assembly = options.get("genome") 6164 6165 spip = find( 6166 f"transcriptome_{spip_assembly}.RData", 6167 config.get("folders", {}).get("databases", {}).get("spip", {}), 6168 ) 6169 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6170 log.debug(f"SPiP annotations: {spip}") 6171 log.debug(f"SpliceAI annotations: {spliceai}") 6172 if spip and spliceai: 6173 return [ 6174 
f"--spip_transcriptome {spip}", 6175 f"--spliceai_annotations {spliceai}", 6176 ] 6177 else: 6178 # TODO crash and go on with basic annotations ? 6179 # raise ValueError( 6180 # "Can't find splice databases in configuration EXIT" 6181 # ) 6182 log.warning( 6183 "Can't find splice databases in configuration, use annotations file from image" 6184 ) 6185 except TypeError: 6186 log.warning( 6187 "Can't find splice databases in configuration, use annotations file from image" 6188 ) 6189 return [] 6190 6191 # Add options, check if transcriptome option have already beend provided 6192 if ( 6193 "spip_transcriptome" not in nf_params 6194 and "spliceai_transcriptome" not in nf_params 6195 ): 6196 splice_reference = splice_annotations(options, config) 6197 if splice_reference: 6198 nf_params.extend(splice_reference) 6199 6200 nf_params.append(f"--output_folder {output_folder}") 6201 6202 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6203 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6204 log.debug(cmd) 6205 6206 splice_config["docker"]["command"] = cmd 6207 6208 docker_cmd = get_bin_command( 6209 tool="splice", 6210 bin_type="docker", 6211 config=config, 6212 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6213 add_options=f"--name {random_uuid} {' '.join(mount)}", 6214 ) 6215 6216 # Docker debug 6217 # if splice_config.get("rm_container"): 6218 # rm_container = "--rm" 6219 # else: 6220 # rm_container = "" 6221 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6222 6223 log.debug(docker_cmd) 6224 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6225 log.debug(res.stdout) 6226 if 
res.stderr: 6227 log.error(res.stderr) 6228 res.check_returncode() 6229 else: 6230 log.warning(f"Splice tool configuration not found: {config}") 6231 6232 # Update variants 6233 log.info("Annotation - Updating...") 6234 # Test find output vcf 6235 log.debug( 6236 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6237 ) 6238 output_vcf = [] 6239 # Wrong folder to look in 6240 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6241 if ( 6242 files 6243 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6244 ): 6245 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6246 # log.debug(os.listdir(options.get("output_folder"))) 6247 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6248 if not output_vcf: 6249 log.debug( 6250 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6251 ) 6252 else: 6253 # Get new header from annotated vcf 6254 log.debug(f"Initial header: {len(header.infos)} fields") 6255 # Create new header with splice infos 6256 new_vcf = Variants(input=output_vcf[0]) 6257 new_vcf_header = new_vcf.get_header().infos 6258 for keys, infos in new_vcf_header.items(): 6259 if keys not in header.infos.keys(): 6260 header.infos[keys] = infos 6261 log.debug(f"New header: {len(header.infos)} fields") 6262 log.debug(f"Splice tmp output: {output_vcf[0]}") 6263 self.update_from_vcf(output_vcf[0]) 6264 6265 # Remove folder 6266 remove_if_exists(output_folder)
This function annotates variants with splice prediction tools (SPiP and SpliceAI).
Parameters
- threads: the number of threads to use
Returns
None; splice annotations are merged back into the variants table.
6272 def get_config_default(self, name: str) -> dict: 6273 """ 6274 The function `get_config_default` returns a dictionary containing default configurations for 6275 various calculations and prioritizations. 6276 6277 :param name: The `get_config_default` function returns a dictionary containing default 6278 configurations for different calculations and prioritizations. The `name` parameter is used to 6279 specify which specific configuration to retrieve from the dictionary 6280 :type name: str 6281 :return: The function `get_config_default` returns a dictionary containing default configuration 6282 settings for different calculations and prioritizations. The specific configuration settings are 6283 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6284 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6285 returned. If there is no match, an empty dictionary is returned. 6286 """ 6287 6288 config_default = { 6289 "calculations": { 6290 "variant_chr_pos_alt_ref": { 6291 "type": "sql", 6292 "name": "variant_chr_pos_alt_ref", 6293 "description": "Create a variant ID with chromosome, position, alt and ref", 6294 "available": False, 6295 "output_column_name": "variant_chr_pos_alt_ref", 6296 "output_column_type": "String", 6297 "output_column_description": "variant ID with chromosome, position, alt and ref", 6298 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6299 "operation_info": True, 6300 }, 6301 "VARTYPE": { 6302 "type": "sql", 6303 "name": "VARTYPE", 6304 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6305 "available": True, 6306 "output_column_name": "VARTYPE", 6307 "output_column_type": "String", 6308 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6309 "operation_query": """ 6310 CASE 6311 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6312 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6313 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6314 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6315 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6316 ELSE 'UNDEFINED' 6317 END 6318 """, 6319 "info_fields": ["SVTYPE"], 6320 "operation_info": True, 6321 }, 6322 "snpeff_hgvs": { 6323 "type": "python", 6324 "name": "snpeff_hgvs", 6325 "description": "HGVS nomenclatures from snpEff annotation", 6326 "available": True, 6327 "function_name": "calculation_extract_snpeff_hgvs", 6328 "function_params": ["snpeff_hgvs", "ANN"], 6329 }, 6330 "snpeff_ann_explode": { 6331 "type": "python", 6332 "name": "snpeff_ann_explode", 6333 "description": "Explode snpEff annotations with uniquify values", 6334 "available": True, 6335 "function_name": "calculation_snpeff_ann_explode", 6336 "function_params": [False, "fields", "snpeff_", "ANN"], 6337 }, 6338 "snpeff_ann_explode_uniquify": { 6339 "type": "python", 6340 "name": "snpeff_ann_explode_uniquify", 6341 "description": "Explode snpEff annotations", 6342 "available": True, 6343 "function_name": "calculation_snpeff_ann_explode", 6344 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6345 }, 6346 "snpeff_ann_explode_json": { 6347 "type": "python", 6348 "name": "snpeff_ann_explode_json", 6349 "description": "Explode snpEff annotations in JSON format", 6350 "available": True, 6351 "function_name": "calculation_snpeff_ann_explode", 6352 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6353 }, 6354 "NOMEN": { 6355 "type": "python", 6356 "name": "NOMEN", 6357 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field", 6358 "available": True, 6359 "function_name": "calculation_extract_nomen", 6360 "function_params": [], 6361 }, 6362 "FINDBYPIPELINE": { 6363 "type": "python", 6364 "name": "FINDBYPIPELINE", 6365 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6366 "available": True, 6367 "function_name": "calculation_find_by_pipeline", 6368 "function_params": ["findbypipeline"], 6369 }, 6370 "FINDBYSAMPLE": { 6371 "type": "python", 6372 "name": "FINDBYSAMPLE", 6373 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6374 "available": True, 6375 "function_name": "calculation_find_by_pipeline", 6376 "function_params": ["findbysample"], 6377 }, 6378 "GENOTYPECONCORDANCE": { 6379 "type": "python", 6380 "name": "GENOTYPECONCORDANCE", 6381 "description": "Concordance of genotype for multi caller VCF", 6382 "available": True, 6383 "function_name": "calculation_genotype_concordance", 6384 "function_params": [], 6385 }, 6386 "BARCODE": { 6387 "type": "python", 6388 "name": "BARCODE", 6389 "description": "BARCODE as VaRank tool", 6390 "available": True, 6391 "function_name": "calculation_barcode", 6392 "function_params": [], 6393 }, 6394 "BARCODEFAMILY": { 6395 "type": "python", 6396 "name": "BARCODEFAMILY", 6397 "description": "BARCODEFAMILY as VaRank tool", 6398 "available": True, 6399 "function_name": "calculation_barcode_family", 6400 "function_params": ["BCF"], 6401 }, 6402 "TRIO": { 6403 "type": "python", 6404 "name": "TRIO", 6405 "description": "Inheritance for a trio family", 6406 "available": True, 6407 "function_name": "calculation_trio", 6408 "function_params": [], 6409 }, 6410 "VAF": { 6411 "type": "python", 6412 "name": "VAF", 6413 "description": "Variant Allele Frequency (VAF) harmonization", 6414 "available": True, 6415 "function_name": "calculation_vaf_normalization", 6416 "function_params": [], 6417 }, 6418 "VAF_stats": { 6419 "type": "python", 6420 "name": 
"VAF_stats", 6421 "description": "Variant Allele Frequency (VAF) statistics", 6422 "available": True, 6423 "function_name": "calculation_genotype_stats", 6424 "function_params": ["VAF"], 6425 }, 6426 "DP_stats": { 6427 "type": "python", 6428 "name": "DP_stats", 6429 "description": "Depth (DP) statistics", 6430 "available": True, 6431 "function_name": "calculation_genotype_stats", 6432 "function_params": ["DP"], 6433 }, 6434 "variant_id": { 6435 "type": "python", 6436 "name": "variant_id", 6437 "description": "Variant ID generated from variant position and type", 6438 "available": True, 6439 "function_name": "calculation_variant_id", 6440 "function_params": [], 6441 }, 6442 }, 6443 "prioritizations": { 6444 "default": { 6445 "filter": [ 6446 { 6447 "type": "notequals", 6448 "value": "!PASS|\\.", 6449 "score": 0, 6450 "flag": "FILTERED", 6451 "comment": ["Bad variant quality"], 6452 }, 6453 { 6454 "type": "equals", 6455 "value": "REJECT", 6456 "score": -20, 6457 "flag": "PASS", 6458 "comment": ["Bad variant quality"], 6459 }, 6460 ], 6461 "DP": [ 6462 { 6463 "type": "gte", 6464 "value": "50", 6465 "score": 5, 6466 "flag": "PASS", 6467 "comment": ["DP higher than 50"], 6468 } 6469 ], 6470 "ANN": [ 6471 { 6472 "type": "contains", 6473 "value": "HIGH", 6474 "score": 5, 6475 "flag": "PASS", 6476 "comment": [ 6477 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6478 ], 6479 }, 6480 { 6481 "type": "contains", 6482 "value": "MODERATE", 6483 "score": 3, 6484 "flag": "PASS", 6485 "comment": [ 6486 "A non-disruptive variant that might change protein effectiveness" 6487 ], 6488 }, 6489 { 6490 "type": "contains", 6491 "value": "LOW", 6492 "score": 0, 6493 "flag": "FILTERED", 6494 "comment": [ 6495 "Assumed to be mostly harmless or unlikely to change protein behavior" 6496 ], 6497 }, 6498 { 6499 "type": "contains", 6500 "value": "MODIFIER", 6501 "score": 0, 6502 
"flag": "FILTERED", 6503 "comment": [ 6504 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6505 ], 6506 }, 6507 ], 6508 } 6509 }, 6510 } 6511 6512 return config_default.get(name, None)
The function get_config_default returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: The `name` parameter is used to specify which configuration section to retrieve
from the `config_default` dictionary.
Returns
The function `get_config_default` returns a dictionary containing default configuration
settings for different calculations and prioritizations. The settings are retrieved based on
the `name` parameter: if `name` matches a key in the `config_default` dictionary, the
corresponding configuration is returned; otherwise no configuration is returned.
6514 def get_config_json( 6515 self, name: str, config_dict: dict = {}, config_file: str = None 6516 ) -> dict: 6517 """ 6518 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6519 default values, a dictionary, and a file. 6520 6521 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6522 the name of the configuration. It is used to identify and retrieve the configuration settings 6523 for a specific component or module 6524 :type name: str 6525 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6526 dictionary that allows you to provide additional configuration settings or overrides. When you 6527 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6528 the key is the configuration setting you want to override or 6529 :type config_dict: dict 6530 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6531 specify the path to a configuration file that contains additional settings. If provided, the 6532 function will read the contents of this file and update the configuration dictionary with the 6533 values found in the file, overriding any existing values with the 6534 :type config_file: str 6535 :return: The function `get_config_json` returns a dictionary containing the configuration 6536 settings. 
6537 """ 6538 6539 # Create with default prioritizations 6540 config_default = self.get_config_default(name=name) 6541 configuration = config_default 6542 # log.debug(f"configuration={configuration}") 6543 6544 # Replace prioritizations from dict 6545 for config in config_dict: 6546 configuration[config] = config_dict[config] 6547 6548 # Replace prioritizations from file 6549 config_file = full_path(config_file) 6550 if config_file: 6551 if os.path.exists(config_file): 6552 with open(config_file) as config_file_content: 6553 config_file_dict = json.load(config_file_content) 6554 for config in config_file_dict: 6555 configuration[config] = config_file_dict[config] 6556 else: 6557 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6558 log.error(msg_error) 6559 raise ValueError(msg_error) 6560 6561 return configuration
The function get_config_json retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: The `name` parameter in the `get_config_json` function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module.
- config_dict: The `config_dict` parameter is a dictionary that allows you to provide additional configuration settings or overrides. When you call the `get_config_json` function, you can pass a dictionary of key-value pairs where each key is a configuration setting you want to override or add.
- config_file: The `config_file` parameter is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the same keys.
Returns
The function `get_config_json` returns a dictionary containing the configuration settings.
    def prioritization(self) -> None:
        """
        It takes a VCF file, and adds a bunch of new INFO fields to it, based on the values of other
        INFO fields.

        Prioritization profiles come from the `prioritization` section of the
        parameters and from a JSON configuration (via `get_config_json`). For
        each selected profile, every configured annotation criterion is turned
        into an SQL UPDATE on the variants table, accumulating per-profile
        columns (e.g. `PZScore_<profile>`, `PZFlag_<profile>`), which are then
        serialized back into the INFO column. Temporary working columns are
        dropped before returning.

        :raises ValueError: if a requested profile is not present in the
            prioritizations configuration.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Quick Prioritizations
        # prioritizations = param.get("prioritization", {}).get("prioritizations", "")

        # Configuration profiles
        prioritization_config_file = param.get("prioritization", {}).get(
            "prioritization_config", None
        )
        prioritization_config_file = full_path(prioritization_config_file)
        prioritizations_config = self.get_config_json(
            name="prioritizations", config_file=prioritization_config_file
        )

        # Prioritization options
        # Profiles/pzfields may be given as comma-separated strings; normalize to lists
        profiles = param.get("prioritization", {}).get("profiles", [])
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = param.get("prioritization", {}).get(
            "pzfields", ["PZFlag", "PZScore"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = param.get("prioritization", {}).get("default_profile", None)
        pzfields_sep = param.get("prioritization", {}).get("pzfields_sep", "_")
        prioritization_score_mode = param.get("prioritization", {}).get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations
        # prioritizations = param.get("prioritization", {}).get("prioritizations", None)
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                log.info(f"   {profile}")

        # If profile "ALL" provided, all profiles in the config profiles
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        if profiles:
            log.info(f"Prioritization... ")
        else:
            # Nothing to do without at least one profile
            log.debug(f"No profile defined")
            return

        # Default profile falls back to the first requested one
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Added columns (temporary; dropped at the end)
        added_columns = []

        # Create list of PZfields
        # List of PZFields: plain names plus one per (pzfield, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Remove existing PZfields to use if exists
        # (fields already present in the VCF header are NOT recomputed)
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos fields so annotation values are queryable columns
            explode_infos_prefix = self.get_explode_infos_prefix()
            added_columns += self.explode_infos(prefix=explode_infos_prefix)
            extra_infos = self.get_extra_infos()

            # PZfields tags description (VCF header metadata for each PZ field)
            PZfields_INFOS = {
                "PZTags": {
                    "ID": "PZTags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                "PZScore": {
                    "ID": "PZScore",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                "PZFlag": {
                    "ID": "PZFlag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                "PZComment": {
                    "ID": "PZComment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                "PZInfos": {
                    "ID": "PZInfos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
            }

            # Create INFO fields if not exist (default-profile fields, no suffix)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Create INFO fields if not exist for each profile (suffixed fields)
            for profile in prioritizations_config:
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Header: add one working column per PZ field, typed by field kind
            for pzfield in list_of_pzfields:
                if re.match("PZScore.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match("PZFlag.*", pzfield):
                    # Flags start TRUE ("PASS") and are AND-ed with each criterion
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # PZ fields set
                        # Each append builds one 'key=value' SQL fragment for the
                        # final INFO serialization

                        # PZScore
                        if f"PZScore{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZScore{pzfields_sep}{profile}=',
                                    PZScore{pzfields_sep}{profile}
                                )
                                """
                            )
                        # Default profile also feeds the unsuffixed PZScore field
                        if (
                            profile == default_profile
                            and "PZScore" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZScore=',
                                    PZScore{pzfields_sep}{profile}
                                )
                                """
                            )

                        # PZFlag (boolean column rendered as PASS/FILTERED)
                        if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZFlag{pzfields_sep}{profile}=',
                                    CASE
                                        WHEN PZFlag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN PZFlag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZFlag" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                concat(
                                    'PZFlag=',
                                    CASE
                                        WHEN PZFlag{pzfields_sep}{profile}==1
                                        THEN 'PASS'
                                        WHEN PZFlag{pzfields_sep}{profile}==0
                                        THEN 'FILTERED'
                                    END
                                )
                                """
                            )

                        # PZComment (only emitted when non-empty)
                        if f"PZComment{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZComment{pzfields_sep}{profile}=', PZComment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZComment" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZComment{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZComment=', PZComment{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # PZInfos (only emitted when non-empty)
                        if f"PZInfos{pzfields_sep}{profile}" in list_of_pzfields:
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZInfos{pzfields_sep}{profile}=', PZInfos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )
                        if (
                            profile == default_profile
                            and "PZInfos" in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                CASE
                                    WHEN PZInfos{pzfields_sep}{profile} NOT IN ('')
                                    THEN concat('PZInfos=', PZInfos{pzfields_sep}{profile})
                                    ELSE ''
                                END
                                """
                            )

                        # Merge PZfields into one ';'-joined SQL expression list
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # Check if annotation field is present (exploded column)
                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
                                log.debug(f"Annotation '{annotation}' not in data")
                                continue
                            else:
                                log.debug(f"Annotation '{annotation}' in data")

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:
                                criterion_type = criterion["type"]
                                criterion_value = criterion["value"]
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_flag_bool = criterion_flag == "PASS"
                                # Escape quotes/separators so values embed safely in SQL and INFO
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                sql_set = []
                                sql_set_info = []

                                # PZ fields set
                                if (
                                    f"PZScore{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # HOWARD mode: scores accumulate; VaRank mode: keep max
                                    if prioritization_score_mode == "HOWARD":
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                    elif prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = CASE WHEN {criterion_score}>PZScore{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    else:
                                        # Unknown mode falls back to HOWARD accumulation
                                        sql_set.append(
                                            f"PZScore{pzfields_sep}{profile} = PZScore{pzfields_sep}{profile} + {criterion_score}"
                                        )
                                if f"PZFlag{pzfields_sep}{profile}" in list_of_pzfields:
                                    # A single FILTERED criterion filters the variant
                                    sql_set.append(
                                        f"PZFlag{pzfields_sep}{profile} = PZFlag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )
                                if (
                                    f"PZComment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        PZComment{pzfields_sep}{profile} =
                                            concat(
                                                PZComment{pzfields_sep}{profile},
                                                CASE
                                                    WHEN PZComment{pzfields_sep}{profile}!=''
                                                    THEN ', '
                                                    ELSE ''
                                                END,
                                                '{criterion_comment}'
                                            )
                                        """
                                    )
                                if (
                                    f"PZInfos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                        PZInfos{pzfields_sep}{profile} =
                                            concat(
                                                PZInfos{pzfields_sep}{profile},
                                                '{criterion_infos}'
                                            )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Criterion and comparison
                                # Numeric values use the mapped comparison operator;
                                # non-numeric values fall back to pattern matching
                                try:
                                    float(criterion_value)
                                    sql_update = f"""
                                    UPDATE {table_variants}
                                    SET {sql_set_option}
                                    WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                    AND "{explode_infos_prefix}{annotation}"{comparison_map[criterion_type]}{criterion_value}
                                    """
                                except:
                                    contains_option = ""
                                    if criterion_type == "contains":
                                        contains_option = ".*"
                                    sql_update = f"""
                                    UPDATE {table_variants}
                                    SET {sql_set_option}
                                    WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                    """
                                sql_queries.append(sql_update)

                        # PZTags: compact 'field#value|field#value' summary per profile
                        if f"PZTags{pzfields_sep}{profile}" in list_of_pzfields:

                            # Create PZFalgs value
                            pztags_value = ""
                            pztags_sep_default = "|"
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in ["PZTags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in ["PZFlag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#',
                                                CASE WHEN PZFlag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Add Query update for PZFlags
                            sql_update_pztags = f"""
                            UPDATE {table_variants}
                            SET INFO = concat(
                                    INFO,
                                    CASE WHEN INFO NOT in ('','.')
                                        THEN ';'
                                        ELSE ''
                                    END,
                                    'PZTags{pzfields_sep}{profile}={pztags_value}'
                                )
                            """
                            sql_queries.append(sql_update_pztags)

                            # Add Query update for PZFlags for default
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                        INFO,
                                        ';',
                                        'PZTags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                        # Serialize the per-profile columns back into INFO
                        log.info(f"""Profile '{profile}' - Update... """)
                        sql_query_update = f"""
                        UPDATE {table_variants}
                        SET INFO =
                            concat(
                                CASE
                                    WHEN INFO NOT IN ('','.')
                                    THEN concat(INFO, ';')
                                    ELSE ''
                                END
                                {sql_set_info_option}
                            )
                        """
                        self.conn.execute(sql_query_update)

            else:

                log.warning(f"No profiles in parameters")

        # Remove added columns (working columns are not kept in the table)
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        return
It takes a VCF file and adds a set of new INFO fields to it, based on the values of other INFO fields.
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Transcripts (refSeq), transcript/protein links (refSeqLink) and the genome
        FASTA are loaded first; the per-variant HGVS names are then computed in
        parallel over a Dask dataframe and written back to the variants table and
        its INFO column (as an `hgvs=` entry), with a matching VCF header record.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
            threads to use for parallel processing. If no value is provided, it will default to the number
            of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            NOTE(review): this closure reads `polars_conn`, `transcripts`, `genome`
            and the HGVS option flags from the enclosing scope; they are bound at
            call time, after being initialized further down in `annotation_hgvs`.

            :param row: A dictionary-like object that contains the values for the following keys:
                "CHROM", "POS", "REF", "ALT"
            :return: a string that contains the HGVS names associated with the given row of data,
                comma-separated.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts (refseq_df registered in polars_conn)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon (only resolved when requested; costs a lookup per variant)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein (looked up through refSeqLink when protein output is needed)
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level name alongside the first
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse comma-separated "opt" / "opt=val" pairs into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # Bare option name means enable it
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out early otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, then folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (letter-only REF/ALT alleles)
        query_variants = f"""
        SELECT "#CHROM" AS CHROM, POS, REF, ALT
        FROM {table_variants}
        WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # Random suffix avoids clashing with a pre-existing column name
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # NOTE(review): df_variants is resolved by duckdb's pandas replacement scan
        refseq_query = f"""
        SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
        FROM {refseq_table}
        JOIN df_variants ON (
            {refseq_table}.chrom = df_variants.CHROM
            AND {refseq_table}.txStart<=df_variants.POS
            AND {refseq_table}.txEnd>=df_variants.POS
        )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
            SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
            FROM {refseqlink_table}
            JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
            WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            # Only transcripts overlapping a variant are exported
            transcripts_query = f"""
            COPY (
                SELECT {refseq_table}.*
                FROM {refseq_table}
                JOIN df_variants ON (
                    {refseq_table}.chrom=df_variants.CHROM
                    AND {refseq_table}.txStart<=df_variants.POS
                    AND {refseq_table}.txEnd>=df_variants.POS
                )
            )
            TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # (re-created so refseq_df/refseqlink_df built above are registered)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe (triggers the computation)
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column
            update_variant_query = f"""
            UPDATE {table_variants}
            SET "{hgvs_column_name}"=df."{hgvs_column_name}"
            FROM read_parquet('{df_parquet}') as df
            WHERE variants."#CHROM" = df.CHROM
            AND variants.POS = df.POS
            AND variants.REF = df.REF
            AND variants.ALT = df.ALT
            AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

            # Update INFO column (append ';' only when INFO is non-empty)
            sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO =
                concat(
                    CASE
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
            self.execute_query(sql_query_update)

        # Add header record for the new 'hgvs' INFO field
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (temporary hgvs working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.
Parameters
- threads: The `threads` parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it defaults to the number of threads obtained from the `get_threads()` method.
7496 def get_operations_help( 7497 self, operations_config_dict: dict = {}, operations_config_file: str = None 7498 ) -> list: 7499 7500 # Init 7501 operations_help = [] 7502 7503 # operations 7504 operations = self.get_config_json( 7505 name="calculations", 7506 config_dict=operations_config_dict, 7507 config_file=operations_config_file, 7508 ) 7509 for op in operations: 7510 op_name = operations[op].get("name", op).upper() 7511 op_description = operations[op].get("description", op_name) 7512 op_available = operations[op].get("available", False) 7513 if op_available: 7514 operations_help.append(f" {op_name}: {op_description}") 7515 7516 # Sort operations 7517 operations_help.sort() 7518 7519 # insert header 7520 operations_help.insert(0, "Available calculation operations:") 7521 7522 # Return 7523 return operations_help
7525 def calculation( 7526 self, 7527 operations: dict = {}, 7528 operations_config_dict: dict = {}, 7529 operations_config_file: str = None, 7530 ) -> None: 7531 """ 7532 It takes a list of operations, and for each operation, it checks if it's a python or sql 7533 operation, and then calls the appropriate function 7534 7535 param json example: 7536 "calculation": { 7537 "NOMEN": { 7538 "options": { 7539 "hgvs_field": "hgvs" 7540 }, 7541 "middle" : null 7542 } 7543 """ 7544 7545 # Param 7546 param = self.get_param() 7547 7548 # operations config 7549 operations_config = self.get_config_json( 7550 name="calculations", 7551 config_dict=operations_config_dict, 7552 config_file=operations_config_file, 7553 ) 7554 7555 # Upper keys 7556 operations_config = {k.upper(): v for k, v in operations_config.items()} 7557 7558 # Calculations 7559 7560 # Operations from param 7561 operations = param.get("calculation", {}).get("calculations", operations) 7562 7563 # Quick calculation - add 7564 if param.get("calculations", None): 7565 calculations_list = [ 7566 value for value in param.get("calculations", "").split(",") 7567 ] 7568 log.info(f"Quick Calculations:") 7569 for calculation_key in calculations_list: 7570 log.info(f" {calculation_key}") 7571 for calculation_operation in calculations_list: 7572 if calculation_operation.upper() not in operations: 7573 operations[calculation_operation.upper()] = {} 7574 add_value_into_dict( 7575 dict_tree=param, 7576 sections=[ 7577 "calculation", 7578 "calculations", 7579 calculation_operation.upper(), 7580 ], 7581 value={}, 7582 ) 7583 7584 # Operations for calculation 7585 if not operations: 7586 operations = param.get("calculation", {}).get("calculations", {}) 7587 7588 if operations: 7589 log.info(f"Calculations...") 7590 7591 # For each operations 7592 for operation_name in operations: 7593 operation_name = operation_name.upper() 7594 if operation_name not in [""]: 7595 if operation_name in operations_config: 7596 
log.info(f"Calculation '{operation_name}'") 7597 operation = operations_config[operation_name] 7598 operation_type = operation.get("type", "sql") 7599 if operation_type == "python": 7600 self.calculation_process_function( 7601 operation=operation, operation_name=operation_name 7602 ) 7603 elif operation_type == "sql": 7604 self.calculation_process_sql( 7605 operation=operation, operation_name=operation_name 7606 ) 7607 else: 7608 log.error( 7609 f"Operations config: Type '{operation_type}' NOT available" 7610 ) 7611 raise ValueError( 7612 f"Operations config: Type '{operation_type}' NOT available" 7613 ) 7614 else: 7615 log.error( 7616 f"Operations config: Calculation '{operation_name}' NOT available" 7617 ) 7618 raise ValueError( 7619 f"Operations config: Calculation '{operation_name}' NOT available" 7620 ) 7621 7622 # Explode INFOS fields into table fields 7623 if self.get_explode_infos(): 7624 self.explode_infos( 7625 prefix=self.get_explode_infos_prefix(), 7626 fields=self.get_explode_infos_fields(), 7627 force=True, 7628 )
It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }
def calculation_process_sql(
    self, operation: dict, operation_name: str = "unknown"
) -> None:
    """
    Perform a SQL-defined calculation operation and update the variants
    table with the result.

    The operation dict describes the calculation:
    - "name": operation name (falls back to the operation_name argument)
    - "output_column_name" / "output_column_type" /
      "output_column_description": output column definition (defaults:
      the operation name, "String", "<name> operation")
    - "operation_query": SQL expression used to fill the output column;
      a list is joined with spaces
    - "info_fields": INFO fields needed by the query
    - "info_fields_check": if True, all "info_fields" must exist in the
      VCF header, otherwise the operation fails
    - "operation_info": if True (default), append the result to INFO
    - "explode_infos_prefix": prefix used for exploded INFO columns

    :param operation: dict describing the operation (see above)
    :type operation: dict
    :param operation_name: name used for logging and error messages when
        the operation has no "name" key, defaults to unknown
    :type operation_name: str (optional)
    :raises ValueError: if the query is missing, a mandatory INFO field
        is absent, or the update query fails
    """

    # table variants
    table_variants = self.get_table_variants(clause="alter")

    # Operation infos
    # Fall back to the operation_name argument instead of discarding it
    # (consistent with calculation_process_function)
    operation_name = operation.get("name", operation_name)
    log.debug(f"process sql {operation_name}")
    output_column_name = operation.get("output_column_name", operation_name)
    output_column_type = operation.get("output_column_type", "String")
    prefix = operation.get("explode_infos_prefix", "")
    output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
    output_column_description = operation.get(
        "output_column_description", f"{operation_name} operation"
    )
    operation_query = operation.get("operation_query", None)
    if isinstance(operation_query, list):
        operation_query = " ".join(operation_query)
    operation_info_fields = operation.get("info_fields", [])
    operation_info_fields_check = operation.get("info_fields_check", False)
    operation_info = operation.get("operation_info", True)

    if operation_query:

        # Info fields check: all declared fields must be in the header
        operation_info_fields_check_result = True
        if operation_info_fields_check:
            header_infos = self.get_header().infos
            for info_field in operation_info_fields:
                operation_info_fields_check_result = (
                    operation_info_fields_check_result
                    and info_field in header_infos
                )

        # If info fields available
        if operation_info_fields_check_result:

            # Columns added temporarily, dropped at the end
            added_columns = []

            # Declare the output field in the VCF header
            vcf_reader = self.get_header()
            vcf_reader.infos[output_column_name] = vcf.parser._Info(
                output_column_name,
                ".",
                output_column_type,
                output_column_description,
                "howard calculation",
                "0",
                self.code_type_map.get(output_column_type),
            )

            # Explode needed INFO fields into columns
            log.debug(f"calculation_process_sql prefix {prefix}")
            added_columns += self.explode_infos(
                prefix=prefix,
                fields=[output_column_name] + operation_info_fields,
                force=True,
            )

            # Create the output column
            added_column = self.add_column(
                table_name=table_variants,
                column_name=prefix + output_column_name,
                column_type=output_column_type_sql,
                default_value="null",
            )
            added_columns.append(added_column)

            # Operation calculation
            try:

                # Query to update calculation column
                sql_update = f"""
                    UPDATE {table_variants}
                    SET "{prefix}{output_column_name}" = ({operation_query})
                """
                self.conn.execute(sql_update)

                # Add to INFO
                if operation_info:
                    sql_update_info = f"""
                        UPDATE {table_variants}
                        SET "INFO" =
                            concat(
                                CASE
                                    WHEN "INFO" IS NOT NULL
                                    THEN concat("INFO", ';')
                                    ELSE ''
                                END,
                                '{output_column_name}=',
                                "{prefix}{output_column_name}"
                            )
                        WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                    """
                    self.conn.execute(sql_update_info)

            # Catch Exception instead of a bare except so that
            # system-exiting exceptions propagate, and chain the
            # original error for debugging
            except Exception as err:
                log.error(
                    f"Operations config: Calculation '{operation_name}' query failed"
                )
                raise ValueError(
                    f"Operations config: Calculation '{operation_name}' query failed"
                ) from err

            # Remove added columns
            for added_column in added_columns:
                log.debug(f"added_column: {added_column}")
                self.drop_column(column=added_column)

        else:
            log.error(
                f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
            )
            raise ValueError(
                f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
            )

    else:
        log.error(
            f"Operations config: Calculation '{operation_name}' query NOT defined"
        )
        raise ValueError(
            f"Operations config: Calculation '{operation_name}' query NOT defined"
        )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the mathematical operation to be performed.
- operation_name: The `operation_name` parameter is a string that represents the name of the mathematical operation being performed; it is used for logging and error handling purposes. Defaults to "unknown".
7770 def calculation_process_function( 7771 self, operation: dict, operation_name: str = "unknown" 7772 ) -> None: 7773 """ 7774 The `calculation_process_function` takes in an operation dictionary and performs the specified 7775 function with the given parameters. 7776 7777 :param operation: The `operation` parameter is a dictionary that contains information about the 7778 operation to be performed. It has the following keys: 7779 :type operation: dict 7780 :param operation_name: The `operation_name` parameter is a string that represents the name of 7781 the operation being performed. It is used for logging purposes, defaults to unknown 7782 :type operation_name: str (optional) 7783 """ 7784 7785 operation_name = operation["name"] 7786 log.debug(f"process sql {operation_name}") 7787 function_name = operation["function_name"] 7788 function_params = operation["function_params"] 7789 getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the operation to be performed.
- operation_name: The `operation_name` parameter is a string that represents the name of the operation being performed; it is used for logging purposes. Defaults to "unknown".
7791 def calculation_variant_id(self) -> None: 7792 """ 7793 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 7794 updates the INFO field of a variants table with the variant ID. 7795 """ 7796 7797 # variant_id annotation field 7798 variant_id_tag = self.get_variant_id_column() 7799 added_columns = [variant_id_tag] 7800 7801 # variant_id hgvs tags" 7802 vcf_infos_tags = { 7803 variant_id_tag: "howard variant ID annotation", 7804 } 7805 7806 # Variants table 7807 table_variants = self.get_table_variants() 7808 7809 # Header 7810 vcf_reader = self.get_header() 7811 7812 # Add variant_id to header 7813 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 7814 variant_id_tag, 7815 ".", 7816 "String", 7817 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 7818 "howard calculation", 7819 "0", 7820 self.code_type_map.get("String"), 7821 ) 7822 7823 # Update 7824 sql_update = f""" 7825 UPDATE {table_variants} 7826 SET "INFO" = 7827 concat( 7828 CASE 7829 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 7830 THEN '' 7831 ELSE concat("INFO", ';') 7832 END, 7833 '{variant_id_tag}=', 7834 "{variant_id_tag}" 7835 ) 7836 """ 7837 self.conn.execute(sql_update) 7838 7839 # Remove added columns 7840 for added_column in added_columns: 7841 self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
def calculation_extract_snpeff_hgvs(
    self,
    snpeff_hgvs: str = "snpeff_hgvs",
    snpeff_field: str = "ANN",
) -> None:
    """
    Extract HGVS nomenclatures from the snpEff annotation field of the
    VCF and append them to the INFO column as a new annotation.

    If the snpEff field is absent from the VCF header, a warning is
    logged and nothing is changed.

    :param snpeff_hgvs: name of the annotation that will hold the HGVS
        nomenclatures extracted from the snpEff field, defaults to
        snpeff_hgvs
    :type snpeff_hgvs: str (optional)
    :param snpeff_field: field of the VCF that contains the snpEff
        annotations, defaults to ANN
    :type snpeff_field: str (optional)
    :raises ValueError: if the snpEff header description cannot be parsed
    """

    # Descriptions for the VCF header
    vcf_infos_tags = {
        snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
    }

    # Prefix for exploded INFO columns
    # NOTE(review): any non-empty prefix is replaced by "INFO/" —
    # confirm this is intended (and not `if not prefix:`)
    prefix = self.get_explode_infos_prefix()
    if prefix:
        prefix = "INFO/"

    # snpEff exploded column names (source and destination)
    speff_ann_infos = prefix + snpeff_field
    speff_hgvs_infos = prefix + snpeff_hgvs

    # Variants table
    table_variants = self.get_table_variants()

    # Header
    vcf_reader = self.get_header()

    # Columns added temporarily, dropped at the end
    added_columns = []

    # Explode the snpEff field into its own column
    added_columns += self.explode_infos(fields=[snpeff_field])

    if snpeff_field in vcf_reader.infos:

        log.debug(vcf_reader.infos[snpeff_field])

        # Extract the ANN sub-field names from the quoted part of the
        # header description (e.g. "... 'Allele | Annotation | ...'")
        ann_description = vcf_reader.infos[snpeff_field].desc
        pattern = r"'(.+?)'"
        match = re.search(pattern, ann_description)
        if match:
            ann_header_match = match.group(1).split(" | ")
            ann_header_desc = {}
            for i in range(len(ann_header_match)):
                # Keep only alphanumeric characters for the key
                ann_header_info = "".join(
                    char for char in ann_header_match[i] if char.isalnum()
                )
                ann_header_desc[ann_header_info] = ann_header_match[i]
            if not ann_header_desc:
                raise ValueError("Invalid header description format")
        else:
            raise ValueError("Invalid header description format")

        # Create variant id helper column
        variant_id_column = self.get_variant_id_column()
        added_columns += [variant_id_column]

        # Create dataframe with variant id and the exploded snpEff column
        dataframe_snpeff_hgvs = self.get_query_to_df(
            f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
        )

        # Compute the HGVS nomenclatures for each variant
        dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
            speff_ann_infos
        ].apply(
            lambda x: extract_snpeff_hgvs(
                str(x), header=list(ann_header_desc.values())
            )
        )

        # Declare the new annotation in the VCF header
        vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
            snpeff_hgvs,
            ".",
            "String",
            vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append the computed values to INFO, joining on the variant id.
        # The local dataframe is referenced by name in the query
        # (resolved by the duckdb connection).
        sql_update = f"""
            UPDATE variants
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                        AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                        THEN concat(
                            '{snpeff_hgvs}=',
                            dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_snpeff_hgvs
            WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

        """
        self.conn.execute(sql_update)

        # Delete dataframe
        del dataframe_snpeff_hgvs
        gc.collect()

    else:

        log.warning(
            "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.
Parameters
- snpeff_hgvs: The `snpeff_hgvs` parameter is the name of the column that will store the HGVS nomenclatures extracted from the snpEff annotation field of the VCF file. Defaults to "snpeff_hgvs".
- snpeff_field: The `snpeff_field` parameter is the field of the VCF file that contains the snpEff annotations from which HGVS nomenclatures are extracted. Defaults to "ANN".
def calculation_snpeff_ann_explode(
    self,
    uniquify: bool = True,
    output_format: str = "fields",
    output_prefix: str = "snpeff_",
    snpeff_field: str = "ANN",
) -> None:
    """
    Explode the snpEff annotation field into annotations appended to the
    INFO column, either as one field per ANN sub-field or as one JSON
    field.

    If the snpEff field is absent from the VCF header, a warning is
    logged and nothing is changed.

    :param uniquify: whether exploded values should be uniquified
        (passed through to `explode_snpeff_ann`), defaults to True
    :type uniquify: bool (optional)
    :param output_format: output format of the exploded annotations,
        "fields" (one INFO field per ANN sub-field) or "JSON" (a single
        JSON INFO field), defaults to fields
    :type output_format: str (optional)
    :param output_prefix: prefix added to the generated annotations (and
        used as the field name itself in JSON mode), defaults to snpeff_
    :type output_prefix: str (optional)
    :param snpeff_field: field of the VCF that contains the snpEff
        annotations, defaults to ANN
    :type snpeff_field: str (optional)
    :raises ValueError: if the snpEff header description cannot be parsed
    """

    # Internal name of the exploded annotation column
    snpeff_hgvs = "snpeff_ann_explode"

    # Descriptions for the VCF header
    vcf_infos_tags = {
        snpeff_hgvs: "Explode snpEff annotations",
    }

    # Prefix for exploded INFO columns
    # NOTE(review): any non-empty prefix is replaced by "INFO/" —
    # confirm this is intended (and not `if not prefix:`)
    prefix = self.get_explode_infos_prefix()
    if prefix:
        prefix = "INFO/"

    # snpEff exploded column names (source and destination)
    speff_ann_infos = prefix + snpeff_field
    speff_hgvs_infos = prefix + snpeff_hgvs

    # Variants table
    table_variants = self.get_table_variants()

    # Header
    vcf_reader = self.get_header()

    # Columns added temporarily, dropped at the end
    added_columns = []

    # Explode the snpEff field into its own column
    added_columns += self.explode_infos(fields=[snpeff_field])
    log.debug(f"snpeff_field={snpeff_field}")
    log.debug(f"added_columns={added_columns}")

    if snpeff_field in vcf_reader.infos:

        # Extract the ANN sub-field names from the quoted part of the
        # header description (e.g. "... 'Allele | Annotation | ...'")
        ann_description = vcf_reader.infos[snpeff_field].desc
        pattern = r"'(.+?)'"
        match = re.search(pattern, ann_description)
        if match:
            ann_header_match = match.group(1).split(" | ")
            ann_header = []
            ann_header_desc = {}
            for i in range(len(ann_header_match)):
                # Keep only alphanumeric characters for the field name
                ann_header_info = "".join(
                    char for char in ann_header_match[i] if char.isalnum()
                )
                ann_header.append(ann_header_info)
                ann_header_desc[ann_header_info] = ann_header_match[i]
            if not ann_header_desc:
                raise ValueError("Invalid header description format")
        else:
            raise ValueError("Invalid header description format")

        # Create variant id helper column
        variant_id_column = self.get_variant_id_column()
        added_columns += [variant_id_column]

        # Create dataframe with variant id and the exploded snpEff column
        dataframe_snpeff_hgvs = self.get_query_to_df(
            f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
        )

        # Compute the exploded snpEff annotations for each variant
        dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
            speff_ann_infos
        ].apply(
            lambda x: explode_snpeff_ann(
                str(x),
                uniquify=uniquify,
                output_format=output_format,
                prefix=output_prefix,
                header=list(ann_header_desc.values()),
            )
        )

        # Declare the generated annotations in the VCF header:
        # in JSON mode, a single field named after the prefix;
        # otherwise, one field per ANN sub-field
        ann_annotations_prefix = ""
        if output_format.upper() in ["JSON"]:
            ann_annotations_prefix = f"{output_prefix}="
            vcf_reader.infos[output_prefix] = vcf.parser._Info(
                output_prefix,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                + " - JSON format",
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )
        else:
            for ann_annotation in ann_header:
                ann_annotation_id = f"{output_prefix}{ann_annotation}"
                vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                    ann_annotation_id,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + f" - '{ann_header_desc[ann_annotation]}' annotation",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

        # Append the computed annotations to INFO, joining on the
        # variant id. The local dataframe is referenced by name in the
        # query (resolved by the duckdb connection).
        sql_update = f"""
            UPDATE variants
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                        AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                        THEN concat(
                            '{ann_annotations_prefix}',
                            dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_snpeff_hgvs
            WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

        """
        self.conn.execute(sql_update)

        # Delete dataframe
        del dataframe_snpeff_hgvs
        gc.collect()

    else:

        log.warning(
            "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)
The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.
Parameters
- uniquify: The `uniquify` parameter is a boolean flag that determines whether the output should be uniquified, i.e. whether duplicate entries are removed. Defaults to True.
- output_format: The `output_format` parameter specifies the format of the generated annotations: "fields" (one annotation per ANN sub-field) or "JSON" (a single JSON annotation). Defaults to "fields".
- output_prefix: The `output_prefix` parameter is the prefix added to the generated annotations, to differentiate them from existing ones. Defaults to "snpeff_".
- snpeff_field: The `snpeff_field` parameter is the field of the VCF file that contains the snpEff annotations; it is exploded to update the variant information. Defaults to "ANN".
def calculation_extract_nomen(self) -> None:
    """
    Extract HGVS nomenclature components (NOMEN, CNOMEN, PNOMEN, ...)
    from the configured HGVS annotation field and append them to the
    INFO column.

    Options are read from param under
    calculation/calculations/NOMEN/options: "hgvs_field" (source field,
    default "hgvs") and "transcripts" (path to a preferred-transcripts
    file, first column used).

    :raises ValueError: if the configured transcripts file does not exist
    """

    # Name of the intermediate column holding the NOMEN dict
    field_nomen_dict = "NOMEN_DICT"

    # NOMEN structure: field name -> VCF header description
    nomen_dict = {
        "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
        "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
        "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
        "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
        "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
        "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
        "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
        "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
        "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
        "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
    }

    # Param
    param = self.get_param()

    # Prefix for exploded INFO columns
    prefix = self.get_explode_infos_prefix()

    # Header
    vcf_reader = self.get_header()

    # HGVS source field (from param, default "hgvs")
    hgvs_field = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("hgvs_field", "hgvs")
    )

    # Optional preferred transcripts (file, first column)
    transcripts_file = (
        param.get("calculation", {})
        .get("calculations", {})
        .get("NOMEN", {})
        .get("options", {})
        .get("transcripts", None)
    )
    transcripts_file = full_path(transcripts_file)
    transcripts = []
    if transcripts_file:
        if os.path.exists(transcripts_file):
            transcripts_dataframe = transcripts_file_to_df(transcripts_file)
            transcripts = transcripts_dataframe.iloc[:, 0].tolist()
        else:
            log.error(f"Transcript file '{transcripts_file}' does NOT exist")
            raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

    # Columns added temporarily, dropped at the end
    added_columns = []

    # Explode the HGVS field into its own column
    added_columns += self.explode_infos(fields=[hgvs_field])

    # The exploded column must exist among the extra infos
    extra_infos = self.get_extra_infos()
    extra_field = prefix + hgvs_field

    if extra_field in extra_infos:

        # Create dataframe keyed by variant coordinates
        dataframe_hgvs = self.get_query_to_df(
            f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
        )

        # Compute the NOMEN dict for each variant
        dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
            lambda x: find_nomen(str(x), transcripts=transcripts)
        )

        # Explode the NOMEN structure into one column per field and
        # build the SQL fragments used to append each field to INFO
        sql_nomen_fields = []
        for nomen_field in nomen_dict:

            # Explode each field into a column
            dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                lambda x: dict(x).get(nomen_field, "")
            )

            # Declare the field in the VCF header
            vcf_reader.infos[nomen_field] = vcf.parser._Info(
                nomen_field,
                ".",
                "String",
                nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )
            sql_nomen_fields.append(
                f"""
                CASE
                    WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                    THEN concat(
                        ';{nomen_field}=',
                        dataframe_hgvs."{nomen_field}"
                    )
                    ELSE ''
                END
                """
            )

        # SQL set for update
        sql_nomen_fields_set = ", ".join(sql_nomen_fields)

        # Append all NOMEN fields to INFO, joining on variant
        # coordinates. The local dataframe is referenced by name in the
        # query (resolved by the duckdb connection).
        sql_update = f"""
            UPDATE variants
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL
                        THEN ''
                        ELSE "INFO"
                    END,
                    {sql_nomen_fields_set}
                )
            FROM dataframe_hgvs
            WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
        """
        self.conn.execute(sql_update)

        # Delete dataframe
        del dataframe_hgvs
        gc.collect()

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
    """
    Count the pipelines/samples in which each variant is found and append
    the result to the INFO column.

    Requires a FORMAT column and at least one sample in the header;
    otherwise the method does nothing.

    :param tag: annotation field name used in the VCF header and in the
        INFO column, defaults to findbypipeline
    :type tag: str (optional)
    """

    # Guard clause: only applicable with a FORMAT column and samples
    if (
        "FORMAT" not in self.get_header_columns_as_list()
        or not self.get_header_sample_list()
    ):
        return

    # Sample list, fetched once and reused
    samples = self.get_header_sample_list()

    # findbypipeline annotation field and its header description
    findbypipeline_tag = tag
    tag_descriptions = {
        findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
    }

    # Exploded column name
    prefix = self.get_explode_infos_prefix()
    findbypipeline_infos = prefix + findbypipeline_tag

    # Variants table and header
    table_variants = self.get_table_variants()
    header = self.get_header()

    # Variant id helper column (dropped at the end)
    variant_id_column = self.get_variant_id_column()
    added_columns = [variant_id_column]

    # Fetch variant id, FORMAT and all sample columns
    select_columns = f" {variant_id_column}, FORMAT , " + " , ".join(samples)
    dataframe_findbypipeline = self.get_query_to_df(
        f""" SELECT {select_columns} FROM {table_variants} """
    )

    # Compute the findbypipeline value per variant
    dataframe_findbypipeline[findbypipeline_infos] = (
        dataframe_findbypipeline.apply(
            lambda row: findbypipeline(row, samples=samples),
            axis=1,
        )
    )

    # Declare the tag in the VCF header
    header.infos[findbypipeline_tag] = vcf.parser._Info(
        findbypipeline_tag,
        ".",
        "String",
        tag_descriptions.get(findbypipeline_tag, "Find in pipeline/sample"),
        "howard calculation",
        "0",
        self.code_type_map.get("String"),
    )

    # Append the computed value to INFO, joining on the variant id.
    # The local dataframe is referenced by name in the query (resolved
    # by the duckdb connection), so its name must stay unchanged.
    sql_update = f"""
        UPDATE variants
        SET "INFO" =
            concat(
                CASE
                    WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                    THEN ''
                    ELSE concat("INFO", ';')
                END,
                CASE
                    WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                    AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                    THEN concat(
                        '{findbypipeline_tag}=',
                        dataframe_findbypipeline."{findbypipeline_infos}"
                    )
                    ELSE ''
                END
            )
        FROM dataframe_findbypipeline
        WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
    """
    self.conn.execute(sql_update)

    # Drop helper columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

    # Free the dataframe
    del dataframe_findbypipeline
    gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.
Parameters
- tag: The `tag` parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table. Defaults to "findbypipeline".
def calculation_genotype_concordance(self) -> None:
    """
    Compute genotype concordance across callers for each variant of a
    multi-caller VCF and append it to the INFO column as
    'genotypeconcordance'.

    Requires a FORMAT column and at least one sample in the header;
    otherwise the method does nothing.
    """

    # if FORMAT and samples
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # genotypeconcordance annotation field
        genotypeconcordance_tag = "genotypeconcordance"

        # Description for the VCF header
        vcf_infos_tags = {
            genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
        }

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Exploded column name
        genotypeconcordance_infos = prefix + genotypeconcordance_tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Variant id helper column (dropped at the end)
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # variant_id, FORMAT and samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Create dataframe
        dataframe_genotypeconcordance = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Compute genotype concordance per variant
        dataframe_genotypeconcordance[genotypeconcordance_infos] = (
            dataframe_genotypeconcordance.apply(
                lambda row: genotypeconcordance(
                    row, samples=self.get_header_sample_list()
                ),
                axis=1,
            )
        )

        # Add genotypeconcordance to header
        # (fallback description fixed: it was a copy-paste of the
        # snpEff one; the fallback branch is normally unreachable since
        # the tag is always a key of vcf_infos_tags)
        vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
            genotypeconcordance_tag,
            ".",
            "String",
            vcf_infos_tags.get(
                genotypeconcordance_tag,
                "Concordance of genotype for multi caller VCF",
            ),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append the computed value to INFO, joining on the variant id.
        # The local dataframe is referenced by name in the query
        # (resolved by the duckdb connection).
        sql_update = f"""
            UPDATE variants
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                        AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                        THEN concat(
                            '{genotypeconcordance_tag}=',
                            dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_genotypeconcordance
            WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Delete dataframe
        del dataframe_genotypeconcordance
        gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
def calculation_barcode(self, tag: str = "barcode") -> None:
    """
    Compute the barcode (VaRank) value of each variant from its sample
    genotypes and append it to the INFO column.

    Requires a FORMAT column and at least one sample in the header;
    otherwise the method does nothing.

    :param tag: tag name used for the barcode annotation in the VCF
        file; an empty value falls back to "barcode", defaults to barcode
    :type tag: str (optional)
    """

    # if FORMAT and samples
    if (
        "FORMAT" in self.get_header_columns_as_list()
        and self.get_header_sample_list()
    ):

        # barcode annotation field (empty tag falls back to default)
        if not tag:
            tag = "barcode"

        # Description for the VCF header
        vcf_infos_tags = {
            tag: "barcode calculation (VaRank)",
        }

        # Prefix for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Exploded column name
        barcode_infos = prefix + tag

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Variant id helper column (dropped at the end)
        variant_id_column = self.get_variant_id_column()
        added_columns = [variant_id_column]

        # variant_id, FORMAT and samples
        samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
            self.get_header_sample_list()
        )

        # Create dataframe
        dataframe_barcode = self.get_query_to_df(
            f""" SELECT {samples_fields} FROM {table_variants} """
        )

        # Compute the barcode per variant
        dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
            lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
        )

        # Add barcode to header (simplified: the redundant
        # `vcf_infos_tags.get(tag, vcf_infos_tags.get(tag))` is replaced
        # by a single lookup with an explicit fallback)
        vcf_reader.infos[tag] = vcf.parser._Info(
            tag,
            ".",
            "String",
            vcf_infos_tags.get(tag, "barcode calculation (VaRank)"),
            "howard calculation",
            "0",
            self.code_type_map.get("String"),
        )

        # Append the computed value to INFO, joining on the variant id.
        # The local dataframe is referenced by name in the query
        # (resolved by the duckdb connection).
        sql_update = f"""
            UPDATE {table_variants}
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                        AND dataframe_barcode."{barcode_infos}" NOT NULL
                        THEN concat(
                            '{tag}=',
                            dataframe_barcode."{barcode_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_barcode
            WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
        """
        self.conn.execute(sql_update)

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # Delete dataframe
        del dataframe_barcode
        gc.collect()
The calculation_barcode function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name "barcode" is used.
8605 def calculation_barcode_family(self, tag: str = "BCF") -> None: 8606 """ 8607 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 8608 and updates the INFO field in the file with the calculated barcode values. 8609 8610 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 8611 the barcode tag that will be added to the VCF file during the calculation process. If no value 8612 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 8613 :type tag: str (optional) 8614 """ 8615 8616 # if FORMAT and samples 8617 if ( 8618 "FORMAT" in self.get_header_columns_as_list() 8619 and self.get_header_sample_list() 8620 ): 8621 8622 # barcode annotation field 8623 if not tag: 8624 tag = "BCF" 8625 8626 # VCF infos tags 8627 vcf_infos_tags = { 8628 tag: "barcode family calculation", 8629 f"{tag}S": "barcode family samples", 8630 } 8631 8632 # Param 8633 param = self.get_param() 8634 log.debug(f"param={param}") 8635 8636 # Prefix 8637 prefix = self.get_explode_infos_prefix() 8638 8639 # PED param 8640 ped = ( 8641 param.get("calculation", {}) 8642 .get("calculations", {}) 8643 .get("BARCODEFAMILY", {}) 8644 .get("family_pedigree", None) 8645 ) 8646 log.debug(f"ped={ped}") 8647 8648 # Load PED 8649 if ped: 8650 8651 # Pedigree is a file 8652 if isinstance(ped, str) and os.path.exists(full_path(ped)): 8653 log.debug("Pedigree is file") 8654 with open(full_path(ped)) as ped: 8655 ped = json.load(ped) 8656 8657 # Pedigree is a string 8658 elif isinstance(ped, str): 8659 log.debug("Pedigree is str") 8660 try: 8661 ped = json.loads(ped) 8662 log.debug("Pedigree is json str") 8663 except ValueError as e: 8664 ped_samples = ped.split(",") 8665 ped = {} 8666 for ped_sample in ped_samples: 8667 ped[ped_sample] = ped_sample 8668 8669 # Pedigree is a dict 8670 elif isinstance(ped, dict): 8671 log.debug("Pedigree is dict") 8672 8673 # Pedigree is not well formatted 8674 else: 
8675 msg_error = "Pedigree not well formatted" 8676 log.error(msg_error) 8677 raise ValueError(msg_error) 8678 8679 # Construct list 8680 ped_samples = list(ped.values()) 8681 8682 else: 8683 log.debug("Pedigree not defined. Take all samples") 8684 ped_samples = self.get_header_sample_list() 8685 ped = {} 8686 for ped_sample in ped_samples: 8687 ped[ped_sample] = ped_sample 8688 8689 # Check pedigree 8690 if not ped or len(ped) == 0: 8691 msg_error = f"Error in pedigree: samples {ped_samples}" 8692 log.error(msg_error) 8693 raise ValueError(msg_error) 8694 8695 # Log 8696 log.info( 8697 "Calculation 'BARCODEFAMILY' - Samples: " 8698 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 8699 ) 8700 log.debug(f"ped_samples={ped_samples}") 8701 8702 # Field 8703 barcode_infos = prefix + tag 8704 8705 # Variants table 8706 table_variants = self.get_table_variants() 8707 8708 # Header 8709 vcf_reader = self.get_header() 8710 8711 # Create variant id 8712 variant_id_column = self.get_variant_id_column() 8713 added_columns = [variant_id_column] 8714 8715 # variant_id, FORMAT and samples 8716 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8717 ped_samples 8718 ) 8719 8720 # Create dataframe 8721 dataframe_barcode = self.get_query_to_df( 8722 f""" SELECT {samples_fields} FROM {table_variants} """ 8723 ) 8724 8725 # Create barcode column 8726 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 8727 lambda row: barcode(row, samples=ped_samples), axis=1 8728 ) 8729 8730 # Add barcode family to header 8731 # Add vaf_normalization to header 8732 vcf_reader.formats[tag] = vcf.parser._Format( 8733 id=tag, 8734 num=".", 8735 type="String", 8736 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 8737 type_code=self.code_type_map.get("String"), 8738 ) 8739 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 8740 id=f"{tag}S", 8741 num=".", 8742 type="String", 8743 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 8744 
type_code=self.code_type_map.get("String"), 8745 ) 8746 8747 # Update 8748 # for sample in ped_samples: 8749 sql_update_set = [] 8750 for sample in self.get_header_sample_list() + ["FORMAT"]: 8751 if sample in ped_samples: 8752 value = f'dataframe_barcode."{barcode_infos}"' 8753 value_samples = "'" + ",".join(ped_samples) + "'" 8754 elif sample == "FORMAT": 8755 value = f"'{tag}'" 8756 value_samples = f"'{tag}S'" 8757 else: 8758 value = "'.'" 8759 value_samples = "'.'" 8760 format_regex = r"[a-zA-Z0-9\s]" 8761 sql_update_set.append( 8762 f""" 8763 "{sample}" = 8764 concat( 8765 CASE 8766 WHEN {table_variants}."{sample}" = './.' 8767 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 8768 ELSE {table_variants}."{sample}" 8769 END, 8770 ':', 8771 {value}, 8772 ':', 8773 {value_samples} 8774 ) 8775 """ 8776 ) 8777 8778 sql_update_set_join = ", ".join(sql_update_set) 8779 sql_update = f""" 8780 UPDATE {table_variants} 8781 SET {sql_update_set_join} 8782 FROM dataframe_barcode 8783 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 8784 """ 8785 self.conn.execute(sql_update) 8786 8787 # Remove added columns 8788 for added_column in added_columns: 8789 self.drop_column(column=added_column) 8790 8791 # Delete dataframe 8792 del dataframe_barcode 8793 gc.collect()
The calculation_barcode_family function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value "BCF" is used.
8795 def calculation_trio(self) -> None: 8796 """ 8797 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 8798 information to the INFO field of each variant. 8799 """ 8800 8801 # if FORMAT and samples 8802 if ( 8803 "FORMAT" in self.get_header_columns_as_list() 8804 and self.get_header_sample_list() 8805 ): 8806 8807 # trio annotation field 8808 trio_tag = "trio" 8809 8810 # VCF infos tags 8811 vcf_infos_tags = { 8812 "trio": "trio calculation", 8813 } 8814 8815 # Param 8816 param = self.get_param() 8817 8818 # Prefix 8819 prefix = self.get_explode_infos_prefix() 8820 8821 # Trio param 8822 trio_ped = ( 8823 param.get("calculation", {}) 8824 .get("calculations", {}) 8825 .get("TRIO", {}) 8826 .get("trio_pedigree", None) 8827 ) 8828 8829 # Load trio 8830 if trio_ped: 8831 8832 # Trio pedigree is a file 8833 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 8834 log.debug("TRIO pedigree is file") 8835 with open(full_path(trio_ped)) as trio_ped: 8836 trio_ped = json.load(trio_ped) 8837 8838 # Trio pedigree is a string 8839 elif isinstance(trio_ped, str): 8840 log.debug("TRIO pedigree is str") 8841 try: 8842 trio_ped = json.loads(trio_ped) 8843 log.debug("TRIO pedigree is json str") 8844 except ValueError as e: 8845 trio_samples = trio_ped.split(",") 8846 if len(trio_samples) == 3: 8847 trio_ped = { 8848 "father": trio_samples[0], 8849 "mother": trio_samples[1], 8850 "child": trio_samples[2], 8851 } 8852 log.debug("TRIO pedigree is list str") 8853 else: 8854 msg_error = "TRIO pedigree not well formatted" 8855 log.error(msg_error) 8856 raise ValueError(msg_error) 8857 8858 # Trio pedigree is a dict 8859 elif isinstance(trio_ped, dict): 8860 log.debug("TRIO pedigree is dict") 8861 8862 # Trio pedigree is not well formatted 8863 else: 8864 msg_error = "TRIO pedigree not well formatted" 8865 log.error(msg_error) 8866 raise ValueError(msg_error) 8867 8868 # Construct trio list 8869 trio_samples = [ 8870 
trio_ped.get("father", ""), 8871 trio_ped.get("mother", ""), 8872 trio_ped.get("child", ""), 8873 ] 8874 8875 else: 8876 log.debug("TRIO pedigree not defined. Take the first 3 samples") 8877 samples_list = self.get_header_sample_list() 8878 if len(samples_list) >= 3: 8879 trio_samples = self.get_header_sample_list()[0:3] 8880 trio_ped = { 8881 "father": trio_samples[0], 8882 "mother": trio_samples[1], 8883 "child": trio_samples[2], 8884 } 8885 else: 8886 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 8887 log.error(msg_error) 8888 raise ValueError(msg_error) 8889 8890 # Check trio pedigree 8891 if not trio_ped or len(trio_ped) != 3: 8892 msg_error = f"Error in TRIO pedigree: {trio_ped}" 8893 log.error(msg_error) 8894 raise ValueError(msg_error) 8895 8896 # Log 8897 log.info( 8898 f"Calculation 'TRIO' - Samples: " 8899 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 8900 ) 8901 8902 # Field 8903 trio_infos = prefix + trio_tag 8904 8905 # Variants table 8906 table_variants = self.get_table_variants() 8907 8908 # Header 8909 vcf_reader = self.get_header() 8910 8911 # Create variant id 8912 variant_id_column = self.get_variant_id_column() 8913 added_columns = [variant_id_column] 8914 8915 # variant_id, FORMAT and samples 8916 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 8917 self.get_header_sample_list() 8918 ) 8919 8920 # Create dataframe 8921 dataframe_trio = self.get_query_to_df( 8922 f""" SELECT {samples_fields} FROM {table_variants} """ 8923 ) 8924 8925 # Create trio column 8926 dataframe_trio[trio_infos] = dataframe_trio.apply( 8927 lambda row: trio(row, samples=trio_samples), axis=1 8928 ) 8929 8930 # Add trio to header 8931 vcf_reader.infos[trio_tag] = vcf.parser._Info( 8932 trio_tag, 8933 ".", 8934 "String", 8935 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 8936 "howard calculation", 8937 "0", 8938 self.code_type_map.get("String"), 8939 ) 8940 8941 # Update 8942 
sql_update = f""" 8943 UPDATE {table_variants} 8944 SET "INFO" = 8945 concat( 8946 CASE 8947 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8948 THEN '' 8949 ELSE concat("INFO", ';') 8950 END, 8951 CASE 8952 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 8953 AND dataframe_trio."{trio_infos}" NOT NULL 8954 THEN concat( 8955 '{trio_tag}=', 8956 dataframe_trio."{trio_infos}" 8957 ) 8958 ELSE '' 8959 END 8960 ) 8961 FROM dataframe_trio 8962 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 8963 """ 8964 self.conn.execute(sql_update) 8965 8966 # Remove added columns 8967 for added_column in added_columns: 8968 self.drop_column(column=added_column) 8969 8970 # Delete dataframe 8971 del dataframe_trio 8972 gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
8974 def calculation_vaf_normalization(self) -> None: 8975 """ 8976 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 8977 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 8978 :return: The function does not return anything. 8979 """ 8980 8981 # if FORMAT and samples 8982 if ( 8983 "FORMAT" in self.get_header_columns_as_list() 8984 and self.get_header_sample_list() 8985 ): 8986 8987 # vaf_normalization annotation field 8988 vaf_normalization_tag = "VAF" 8989 8990 # VCF infos tags 8991 vcf_infos_tags = { 8992 "VAF": "VAF Variant Frequency", 8993 } 8994 8995 # Prefix 8996 prefix = self.get_explode_infos_prefix() 8997 8998 # Variants table 8999 table_variants = self.get_table_variants() 9000 9001 # Header 9002 vcf_reader = self.get_header() 9003 9004 # Do not calculate if VAF already exists 9005 if "VAF" in vcf_reader.formats: 9006 log.debug("VAF already on genotypes") 9007 return 9008 9009 # Create variant id 9010 variant_id_column = self.get_variant_id_column() 9011 added_columns = [variant_id_column] 9012 9013 # variant_id, FORMAT and samples 9014 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9015 self.get_header_sample_list() 9016 ) 9017 9018 # Create dataframe 9019 dataframe_vaf_normalization = self.get_query_to_df( 9020 f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9021 ) 9022 9023 vaf_normalization_set = [] 9024 9025 # for each sample vaf_normalization 9026 for sample in self.get_header_sample_list(): 9027 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9028 lambda row: vaf_normalization(row, sample=sample), axis=1 9029 ) 9030 vaf_normalization_set.append( 9031 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9032 ) 9033 9034 # Add VAF to FORMAT 9035 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9036 "FORMAT" 9037 ].apply(lambda x: str(x) + ":VAF") 9038 
vaf_normalization_set.append( 9039 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9040 ) 9041 9042 # Add vaf_normalization to header 9043 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9044 id=vaf_normalization_tag, 9045 num="1", 9046 type="Float", 9047 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9048 type_code=self.code_type_map.get("Float"), 9049 ) 9050 9051 # Create fields to add in INFO 9052 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9053 9054 # Update 9055 sql_update = f""" 9056 UPDATE {table_variants} 9057 SET {sql_vaf_normalization_set} 9058 FROM dataframe_vaf_normalization 9059 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9060 9061 """ 9062 self.conn.execute(sql_update) 9063 9064 # Remove added columns 9065 for added_column in added_columns: 9066 self.drop_column(column=added_column) 9067 9068 # Delete dataframe 9069 del dataframe_vaf_normalization 9070 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
9072 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9073 """ 9074 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9075 field in a VCF file and updates the INFO column of the variants table with the calculated 9076 statistics. 9077 9078 :param info: The `info` parameter is a string that represents the type of information for which 9079 genotype statistics are calculated. It is used to generate various VCF info tags for the 9080 statistics, such as the number of occurrences, the list of values, the minimum value, the 9081 maximum value, the mean, the median, defaults to VAF 9082 :type info: str (optional) 9083 """ 9084 9085 # if FORMAT and samples 9086 if ( 9087 "FORMAT" in self.get_header_columns_as_list() 9088 and self.get_header_sample_list() 9089 ): 9090 9091 # vaf_stats annotation field 9092 vaf_stats_tag = info + "_stats" 9093 9094 # VCF infos tags 9095 vcf_infos_tags = { 9096 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 9097 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 9098 info + "_stats_min": f"genotype {info} Statistics - min {info}", 9099 info + "_stats_max": f"genotype {info} Statistics - max {info}", 9100 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 9101 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 9102 info 9103 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 9104 } 9105 9106 # Prefix 9107 prefix = self.get_explode_infos_prefix() 9108 9109 # Field 9110 vaf_stats_infos = prefix + vaf_stats_tag 9111 9112 # Variants table 9113 table_variants = self.get_table_variants() 9114 9115 # Header 9116 vcf_reader = self.get_header() 9117 9118 # Create variant id 9119 variant_id_column = self.get_variant_id_column() 9120 added_columns = [variant_id_column] 9121 9122 # variant_id, FORMAT and samples 9123 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9124 
self.get_header_sample_list() 9125 ) 9126 9127 # Create dataframe 9128 dataframe_vaf_stats = self.get_query_to_df( 9129 f""" SELECT {samples_fields} FROM {table_variants} """ 9130 ) 9131 9132 # Create vaf_stats column 9133 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 9134 lambda row: genotype_stats( 9135 row, samples=self.get_header_sample_list(), info=info 9136 ), 9137 axis=1, 9138 ) 9139 9140 # List of vcf tags 9141 sql_vaf_stats_fields = [] 9142 9143 # Check all VAF stats infos 9144 for stat in vcf_infos_tags: 9145 9146 # Extract stats 9147 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9148 lambda x: dict(x).get(stat, "") 9149 ) 9150 9151 # Add snpeff_hgvs to header 9152 vcf_reader.infos[stat] = vcf.parser._Info( 9153 stat, 9154 ".", 9155 "String", 9156 vcf_infos_tags.get(stat, "genotype statistics"), 9157 "howard calculation", 9158 "0", 9159 self.code_type_map.get("String"), 9160 ) 9161 9162 if len(sql_vaf_stats_fields): 9163 sep = ";" 9164 else: 9165 sep = "" 9166 9167 # Create fields to add in INFO 9168 sql_vaf_stats_fields.append( 9169 f""" 9170 CASE 9171 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9172 THEN concat( 9173 '{sep}{stat}=', 9174 dataframe_vaf_stats."{stat}" 9175 ) 9176 ELSE '' 9177 END 9178 """ 9179 ) 9180 9181 # SQL set for update 9182 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9183 9184 # Update 9185 sql_update = f""" 9186 UPDATE variants 9187 SET "INFO" = 9188 concat( 9189 CASE 9190 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9191 THEN '' 9192 ELSE concat("INFO", ';') 9193 END, 9194 {sql_vaf_stats_fields_set} 9195 ) 9196 FROM dataframe_vaf_stats 9197 WHERE variants."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9198 9199 """ 9200 self.conn.execute(sql_update) 9201 9202 # Remove added columns 9203 for added_column in added_columns: 9204 self.drop_column(column=added_column) 9205 9206 # Delete dataframe 9207 del dataframe_vaf_stats 9208 gc.collect()
The calculation_genotype_stats function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- info: The `info` parameter is a string that represents the genotype field for which statistics are calculated. It is used to generate the VCF INFO tags for the statistics, such as the number of occurrences, the list of values, the minimum, the maximum, the mean, the median, and the standard deviation. Defaults to "VAF".